# Bayesian Personalized Ranking Trees
* Trains a LightGBM binary classifier for pairwise classification

In [2]:
using LightGBM
import NBInclude: @nbinclude
@nbinclude("BPRBase.ipynb")
@nbinclude("EnsembleInputs.ipynb");

┌ Info: lib_lightgbm not found in system dirs, trying fallback
└ @ LightGBM /home/kundan/.julia/packages/LightGBM/A7zVd/src/LightGBM.jl:25


## LightGBM Datasets

In [3]:
# TODO move to a shared LGBM package

"""
    augment_dataset(ds, y, w)

Set the `"label"` field of `ds` to `y` and its `"weight"` field to `w` via
`LightGBM.LGBM_DatasetSetField`, then return `ds`.
"""
function augment_dataset(ds, y, w)
    # Labels are written before weights, one field per call.
    for (field, values) in (("label", y), ("weight", w))
        LightGBM.LGBM_DatasetSetField(ds, field, values)
    end
    return ds
end

"""
    create_train_dataset(X, y, w, estimator)

Create a LightGBM dataset from `X` using the stringified parameters of `estimator`,
then attach the labels `y` and weights `w` with `augment_dataset`.

Returns the dataset produced by `LightGBM.LGBM_DatasetCreateFromMat`.
"""
function create_train_dataset(X, y, w, estimator)
    params = LightGBM.stringifyparams(estimator)
    # NOTE(review): the trailing `false` is presumably the `is_row_major` flag of the
    # LightGBM.jl wrapper — confirm against the installed version.
    dataset = LightGBM.LGBM_DatasetCreateFromMat(X, params, false)
    return augment_dataset(dataset, y, w)
end

"""
    create_test_dataset(X, y, w, estimator, train_ds)

Create a LightGBM dataset from `X` using the stringified parameters of `estimator` and
`train_ds` as an extra argument to `LightGBM.LGBM_DatasetCreateFromMat`, then attach the
labels `y` and weights `w` with `augment_dataset`.

Returns the newly created dataset.
"""
function create_test_dataset(X, y, w, estimator, train_ds)
    params = LightGBM.stringifyparams(estimator)
    # NOTE(review): `train_ds` is presumably the reference dataset whose binning is
    # reused, and `false` the `is_row_major` flag — confirm against LightGBM.jl.
    dataset = LightGBM.LGBM_DatasetCreateFromMat(X, params, train_ds, false)
    return augment_dataset(dataset, y, w)
end;

In [4]:
"""
    get_pairwise_dataset(split, user_features, training; batch_size = 1024, epochs = 10000)

Draw `epochs` batches from `get_batch` and stack them into a single dataset.

For every batch, the first and second entries of `batch[1]` are moved through `cpu`,
transposed, and collected; the collected pieces are then concatenated vertically.

# Arguments
- `split`, `user_features`: forwarded unchanged to `get_batch`.
- `training`: forwarded to `get_batch` as its `training` keyword.

# Keywords
- `batch_size`: batch size handed to `get_batch`.
- `epochs`: number of batches to draw.

# Returns
A tuple `(X, y, w)` where `X` is the stacked first entries, `y` is the stacked second
entries flattened with `vec`, and `w` has the shape and element type of `y` with every
entry set to one. When no batch is drawn, three empty `Any[]` vectors are returned.
"""
function get_pairwise_dataset(
    split,
    user_features,
    training;
    batch_size = 1024,
    epochs = 10000,
)
    @info "getting pairwise dataset"
    Xs = []
    ys = []
    @showprogress for _ = 1:epochs
        batch = get_batch(split, user_features, batch_size, training = training)
        push!(Xs, cpu(batch[1][1])')
        push!(ys, cpu(batch[1][2])')
    end
    if isempty(Xs)
        # No batches were drawn; keep the historical empty result.
        return Any[], Any[], Any[]
    end
    # `identity.(…)` narrows the element type of the collected pieces so that
    # `reduce(vcat, …)` concatenates in one pass instead of splatting thousands of
    # arrays into a single varargs `vcat` call.
    X = reduce(vcat, identity.(Xs))
    y = vec(reduce(vcat, identity.(ys)))
    # Uniform weights: same shape and element type as the labels.
    w = fill!(similar(y), 1)
    return X, y, w
end;

## Train and Save Model

In [5]:
"""
    train_alpha(outdir, allow_ptw)

Train a LightGBM binary classifier on pairwise samples and persist it with
`write_params`.

Steps, in order:
1. Point logging at `outdir` via `set_logging_outdir`.
2. Build the alpha list (`"Explicit"`, `"NonlinearImplicit"`, plus `"NonlinearPtw"` when
   `allow_ptw` is true) and load the splits and user features with `get_data`.
3. Sample a training and a test dataset with `get_pairwise_dataset`; both are sampled
   with the `training` argument set to `false`.
4. Fit the estimator on the training dataset, using the test dataset for evaluation.
5. Write a dictionary holding the fitted `"model"` and the `"alphas"` to `outdir`.

Returns whatever `write_params` returns.
"""
function train_alpha(outdir, allow_ptw)
    set_logging_outdir(outdir)

    # Commented-out candidates kept from the original list: explicit_raw_alphas,
    # implicit_raw_alphas, nondirectional_raw_alphas.
    alphas = String["Explicit", "NonlinearImplicit"]
    allow_ptw && push!(alphas, "NonlinearPtw")

    training, test, user_features = get_data(alphas, allow_ptw)

    # Hyperparameters are fixed here; they are passed on as keyword arguments.
    hyperparams = (
        objective = "binary",
        num_iterations = 100,
        learning_rate = 0.01,
        early_stopping_round = 10,
        feature_fraction = 0.8,
        bagging_fraction = 0.9,
        bagging_freq = 1,
        num_leaves = 1000,
        num_class = 1,
        metric = ["auc", "binary_logloss"],
    )
    estimator = LGBMClassification(; hyperparams...)

    # Sample both splits before any LightGBM dataset is built.
    train_xyw = get_pairwise_dataset(training, user_features, false)
    test_xyw = get_pairwise_dataset(test, user_features, false)

    train_ds = create_train_dataset(train_xyw..., estimator)
    test_ds = create_test_dataset(test_xyw..., estimator, train_ds)
    fit!(estimator, train_ds, test_ds)

    saved = Dict("model" => estimator, "alphas" => alphas)
    return write_params(saved, outdir)
end;

In [5]:
# Run training with outdir "BPR.tree" and allow_ptw = false, so "NonlinearPtw" is not
# added to the alphas.
train_alpha("BPR.tree", false)

[38;5;6m[1m[ [22m[39m[38;5;6m[1mInfo: [22m[39m20220827 20:56:44 getting user features
[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:01[39m
[38;5;6m[1m[ [22m[39m[38;5;6m[1mInfo: [22m[39m20220827 20:56:46 getting test explicit alphas
[32mProgress: 100%|███████████████████████████| Time: 0:00:00 ( 0.78 μs/it)[39m
[38;5;6m[1m[ [22m[39m[38;5;6m[1mInfo: [22m[39m20220827 20:56:50 getting test implicit alphas
[32mProgress: 100%|███████████████████████████| Time: 0:00:00 ( 0.70 μs/it)[39m
[38;5;6m[1m[ [22m[39m[38;5;6m[1mInfo: [22m[39m20220827 20:56:51 getting test negative alphas
[32mProgress: 100%|███████████████████████████| Time: 0:00:25 ( 1.44 μs/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:01:47 ( 6.15 μs/it)[39m
[32mProgress: 100%|█████████████████████████████████████████| Time: 0:01:01[39m
[38;5;6m[1m[ [22m[39m[38;5;6m[1mInfo: [22m[39m20220827 21:00:21 adding priorities for test explicit
[3

[LightGBM] [Info] Number of positive: 5118623, number of negative: 5121377
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1020
[LightGBM] [Info] Number of data points in the train set: 10240000, number of used features: 4
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499866 -> initscore=-0.000538
[LightGBM] [Info] Start training from score -0.000538
Iteration: 1, test_1's auc: 0.9958026099163999, 
Iteration: 1, test_1's binary_logloss: 0.6841984779872886
Iteration: 2, test_1's auc: 0.9928208317545321, 
Iteration: 2, test_1's binary_logloss: 0.6767344414773862
Iteration: 3, test_1's auc: 0.9948635561690041, 
Iteration: 3, test_1's binary_logloss: 0.6681087734500963
Iteration: 4, test_1's auc: 0.9954056575146946, 
Iteration: 4, test_1's binary_logloss: 0.6596508676074182
Iteration: 5, test_1's auc: 0.9959501719843429, 
Iteration: 5, test_1's binary_logloss: 0.6525824895205073
Itera

In [6]:
# Run training with outdir "BPR.tree.ptw" and allow_ptw = true, so "NonlinearPtw" is
# appended to the alphas.
train_alpha("BPR.tree.ptw", true)

[38;5;6m[1m[ [22m[39m[38;5;6m[1mInfo: [22m[39m20220827 21:37:24 getting user features
[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:01[39m
[38;5;6m[1m[ [22m[39m[38;5;6m[1mInfo: [22m[39m20220827 21:37:26 getting test explicit alphas
[32mProgress: 100%|███████████████████████████| Time: 0:00:00 ( 0.83 μs/it)[39mm
[38;5;6m[1m[ [22m[39m[38;5;6m[1mInfo: [22m[39m20220827 21:37:30 getting test implicit alphas
[32mProgress: 100%|███████████████████████████| Time: 0:00:00 ( 0.76 μs/it)[39m
[38;5;6m[1m[ [22m[39m[38;5;6m[1mInfo: [22m[39m20220827 21:37:30 getting test negative alphas
[32mProgress: 100%|███████████████████████████| Time: 0:00:25 ( 1.44 μs/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:01:43 ( 5.93 μs/it)[39m
[38;5;6m[1m[ [22m[39m[38;5;6m[1mInfo: [22m[39m20220827 21:39:55 getting test ptw alphas
[32mProgress: 100%|███████████████████████████| Time: 0:00:00 ( 0.91 μs/it)[39m
[32mProgress:

[LightGBM] [Info] Number of positive: 5120018, number of negative: 5119982
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1530
[LightGBM] [Info] Number of data points in the train set: 10240000, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500002 -> initscore=0.000007
[LightGBM] [Info] Start training from score 0.000007
Iteration: 1, test_1's auc: 0.9778567849644422, 
Iteration: 1, test_1's binary_logloss: 0.6853845618756418
Iteration: 2, test_1's auc: 0.9881626177427713, 
Iteration: 2, test_1's binary_logloss: 0.6773676314688329
Iteration: 3, test_1's auc: 0.9867320878557668, 
Iteration: 3, test_1's binary_logloss: 0.6699008520025274
Iteration: 4, test_1's auc: 0.9901060056388191, 
Iteration: 4, test_1's binary_logloss: 0.6625391474348097
Iteration: 5, test_1's auc: 0.9905243124447334, 
Iteration: 5, test_1's binary_logloss: 0.6549500877006545
Iterati

In [7]:
# Iteration: 1000, test_1's binary_logloss: 0.11105721870720313, 
# Iteration: 1000, test_1's auc: 0.9924296197751328