In [1]:
# Identifier of this alpha; interpolated into the model paths under ../../data/alphas/.
name = "CombineSignals"
# NOTE(review): not referenced anywhere in this notebook — presumably read by the
# included Alpha.ipynb; confirm before changing or removing.
residual_alphas = [];

In [2]:
using Random
import XGBoost

In [3]:
using NBInclude
@nbinclude("Alpha.ipynb");

In [4]:
@nbinclude("XGBoostFeatures.ipynb");

## Train a linear model

In [5]:
# Turn whole-valued numbers into `Int` and pass anything else through unchanged, so a
# value interpolated into an alpha name prints as "16" rather than "16.0".
function downcast_to_int(x)
    isinteger(x) || return x
    return Int(x)
end

# Names of the alphas that get blended, concatenated into one flat vector of strings.
alphas = vcat(
    ["UserItemBiases"],
    map(K -> "ItemCF.$K", downcast_to_int.([2^4, 2^6, 2^8, 2^10])),
    map(K -> "ItemCFResid.$K", downcast_to_int.([2^4, 2^6, 2^8, 2^10])),
    map(K -> "MatrixFactorization.$K", downcast_to_int.([10, 20, 40])),
    map(relation -> "ItemCFRelated.$relation", ["strict_relations"]),
    ["ItemCFEmbed.1024"], # 0.12%
    ["UserCF.256"], # 0.18%
    ["GNN"], # 0.40%
)

16-element Vector{String}:
 "UserItemBiases"
 "ItemCF.16"
 "ItemCF.64"
 "ItemCF.256"
 "ItemCF.1024"
 "ItemCFResid.16"
 "ItemCFResid.64"
 "ItemCFResid.256"
 "ItemCFResid.1024"
 "MatrixFactorization.10"
 "MatrixFactorization.20"
 "MatrixFactorization.40"
 "ItemCFRelated.strict_relations"
 "ItemCFEmbed.1024"
 "UserCF.256"
 "GNN"

In [6]:
# Fit the linear blend weights on the validation split: `\` solves X * β ≈ y.
# NOTE(review): assumes get_indep returns a matrix with one column per entry of
# `alphas` (the result below has 16 entries, matching the 16 alphas) — confirm.
β = get_indep("validation", alphas) \ get_dep("validation")

[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:24[39m


16-element Vector{Float64}:
  0.9904704580363862
  0.10990674868178746
  0.08610537930319749
  0.016653407254069386
  0.28059270546023146
 -0.004790322385921639
  0.05493360384640813
  0.16709922284734302
  1.3716677290922157
  0.0850362234214797
  0.08895677730957696
  0.11736735932603201
  4.196287881157939
  0.11755552883067967
  0.8575211574848711
  0.26797949930994275

In [7]:
"""
    evaluate(X, y, β)

Score the linear model `X * β` against the targets `y`.

Predictions are clamped to `[1, 10]` before scoring. Returns the tuple
`(rmse, mae, r2, mse)`.
"""
function evaluate(X, y, β)
    clamped = clamp.(X * β, 1, 10)
    metrics = (rmse, mae, r2, mse)
    return map(metric -> metric(y, clamped), metrics)
end;

In [8]:
# (rmse, mae, r2, mse) of the linear model on the validation split, i.e. the data β was fitted on.
evaluate(get_indep("validation", alphas), get_dep("validation"), β)

[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:01[39m


(1.0863929404046642, 0.7843237995073383, 0.6277475610346602, 1.180249620961092)

In [9]:
# (rmse, mae, r2, mse) of the same β on the test split.
evaluate(get_indep("test", alphas), get_dep("test"), β)

[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:24[39m


(1.0865915182160761, 0.7845045400498323, 0.627678330398362, 1.1806811274591171)

In [10]:
# Linear model test RMSE (first value of the tuple above): 1.0865915182160761

## Train an XGBoost model

In [11]:
"""
    get_xgboost_split(split, training_perc, Y, extra_features)

Build the XGBoost feature matrix for `split`, shuffle its rows with a fixed seed and
return `(dtrain, dtest)`: two `XGBoost.DMatrix` objects holding the first
`training_perc` fraction of the shuffled rows and the remaining rows, each labelled
with the matching entries of `Y`.

# Arguments
- `split`: name of the data split, forwarded to `get_augmented_indep`.
- `training_perc`: fraction of the rows that goes into the first (training) matrix.
- `Y`: labels, one per row of the feature matrix.
- `extra_features`: `nothing`, or additional columns appended to the features via `hcat`.

# Throws
- `DimensionMismatch` if the feature matrix does not have exactly `length(Y)` rows.
"""
function get_xgboost_split(split, training_perc, Y, extra_features)
    # Reseeds the global RNG so that every call produces the same partition.
    Random.seed!(20220104)
    # Size the shuffle from the labels that are actually used, instead of re-loading
    # the split's targets just to count them.
    n_rows = length(Y)
    row_order = shuffle(1:n_rows)
    n_train = round(Int, n_rows * training_perc)
    features = get_augmented_indep(split, alphas, β)
    if !isnothing(extra_features)
        features = hcat(features, extra_features)
    end
    # Fail loudly rather than pairing features with labels from a different row set.
    size(features, 1) == n_rows || throw(
        DimensionMismatch(
            "features for split \"$split\" have $(size(features, 1)) rows but Y has $n_rows entries",
        ),
    )
    X = convert.(Float32, features)
    train_rows = row_order[1:n_train]
    test_rows = row_order[(n_train + 1):end]
    return (
        XGBoost.DMatrix(X[train_rows, :], label = Y[train_rows]),
        XGBoost.DMatrix(X[test_rows, :], label = Y[test_rows]),
    )
end;

In [12]:
"""
    get_xgboost_dep(split)

Return, as `Float32`, the residual left after subtracting the linear model's
prediction (`get_indep(split, alphas) * β`) from the targets of `split`.
"""
function get_xgboost_dep(split)
    targets = get_dep(split)
    linear_prediction = get_indep(split, alphas) * β
    return convert.(Float32, targets - linear_prediction)
end

"""
    train_model(split, Y, extra_features = nothing; num_rounds = 300, training_split_perc = 0.9)

Train an XGBoost regressor (`reg:squarederror`) to predict `Y` from the augmented
features of `split` and return the trained booster.

# Arguments
- `split`: name of the data split the features are built from.
- `Y`: regression targets, one per row of the split.
- `extra_features`: `nothing`, or additional feature columns passed on to
  `get_xgboost_split`.

# Keywords
- `num_rounds`: number of boosting rounds.
- `training_split_perc`: fraction of the rows used for fitting; the rest is only
  reported through the watchlist. When it equals 1 the "test" entry is omitted.
"""
function train_model(
    split,
    Y,
    extra_features = nothing;
    num_rounds = 300,
    training_split_perc = 0.9,
)
    # TODO early stopping
    dtrain, dtest = get_xgboost_split(split, training_split_perc, Y, extra_features)
    watchlist = [(dtrain, "train")]
    if training_split_perc != 1
        # Only report held-out error when some rows were actually held out.
        push!(watchlist, (dtest, "test"))
    end
    return XGBoost.xgboost(
        dtrain,
        num_rounds,
        watchlist = watchlist,
        objective = "reg:squarederror",
        nthread = Threads.nthreads(),
    )
end;

In [13]:
"""
    evaluate(bst)

Score the linear model plus the XGBoost correction `bst` on the "test" split.

Returns a 3-tuple: the RMSE and MAE of the combined prediction (clamped to `[1, 10]`)
against the targets, followed by the mean absolute value of the raw booster output.
"""
function evaluate(bst)
    booster_inputs = convert.(Float32, get_augmented_indep("test", alphas, β))
    targets = convert.(Float32, get_dep("test"))
    corrections = XGBoost.predict(bst, booster_inputs)
    combined = get_indep("test", alphas) * β + corrections
    clamped = clamp.(combined, 1, 10)
    return (rmse(targets, clamped), mae(targets, clamped), mean(abs.(corrections)))
end;

In [14]:
# Fit the booster on the validation split, using the linear model's residuals as targets.
bst = train_model("validation", get_xgboost_dep("validation"));

[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:01[39m
[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:02[39m
[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:01[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:02 ( 0.48 μs/it)[39mit)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:01 ( 0.39 μs/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:03 ( 0.83 μs/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:03 ( 0.81 μs/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:03 ( 0.83 μs/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:03 ( 0.84 μs/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:00 ( 1.31 μs/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:00 ( 1.15 μs/it)[39m
[38;5;4m[1m[ [22m[39m[38;5;4m[1mDebug: [22m[39m20220128 20:23:02 nsfw categories: 

In [15]:
# [300]	train-rmse:1.064098	test-rmse:1.072669
# (1.0730632033877396, 0.7742225118286815, 0.12442561f0)

In [16]:
# evaluate(bst) returns three values, so label all of them in the log line.
@debug "XGBoost model test (rmse, mae, mean abs xgboost prediction) = $(evaluate(bst))"

[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:02[39m
[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:01[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:01 ( 0.45 μs/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:01 ( 0.40 μs/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:03 ( 0.82 μs/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:03 ( 0.84 μs/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:03 ( 0.81 μs/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:03 ( 0.84 μs/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:00 ( 1.13 μs/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:00 ( 1.12 μs/it)[39m
[38;5;4m[1m[ [22m[39m[38;5;4m[1mDebug: [22m[39m20220128 20:54:47 nsfw categories: String7["white"]
[32mProgress: 100%|███████████████████████████| Time: 0:00:01 ( 1.89 μs/

In [17]:
xgboost_model_fn = "../../data/alphas/$name/xgb.model"
# Nothing earlier in this notebook creates the output directory, so make sure it
# exists before saving; mkpath is a no-op when the directory is already there.
mkpath(dirname(xgboost_model_fn))
XGBoost.save(bst, xgboost_model_fn)

## Train a model to predict confidence intervals

In [18]:
# Rebuild the path and load the booster back from the saved model file
# (presumably so this section can be run without retraining — confirm).
xgboost_model_fn = "../../data/alphas/$name/xgb.model"
bst = XGBoost.Booster(model_file = xgboost_model_fn)

XGBoost.Booster(Ptr{Nothing} @0x000000001e79e580)

In [19]:
"""
    get_error(bst, split)

Compute the absolute error of the combined model (linear prediction plus the booster
`bst`) on every row of `split`.

Returns a tuple: a `RatingsDataset` built from the split's `user` and `item` columns
with the absolute errors in the rating slot, and the raw booster predictions.
"""
function get_error(bst, split)
    booster_inputs = convert.(Float32, get_augmented_indep(split, alphas, β))
    targets = convert.(Float32, get_dep(split))
    bst_preds = XGBoost.predict(bst, booster_inputs)
    combined = bst_preds + get_indep(split, alphas) * β
    abs_errors = abs.(targets - combined)
    rows = get_split(split)
    return RatingsDataset(rows.user, rows.item, abs_errors), bst_preds
end;

In [20]:
# Absolute errors of the combined model on the test split, plus the raw booster predictions.
errors, test_preds = get_error(bst, "test");

[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:01[39m


In [26]:
# Fit a second booster on the test split, with the absolute errors computed above as targets.
error_bst = train_model("test", errors.rating);

[1]	train-rmse:0.754402	test-rmse:0.753478
[2]	train-rmse:0.734739	test-rmse:0.733891
[3]	train-rmse:0.724126	test-rmse:0.723325
[4]	train-rmse:0.717872	test-rmse:0.717136
[5]	train-rmse:0.714267	test-rmse:0.713614
[6]	train-rmse:0.711727	test-rmse:0.711146
[7]	train-rmse:0.709842	test-rmse:0.709298
[8]	train-rmse:0.708177	test-rmse:0.707682
[9]	train-rmse:0.707260	test-rmse:0.706798
[10]	train-rmse:0.706469	test-rmse:0.706051
[11]	train-rmse:0.705861	test-rmse:0.705500
[12]	train-rmse:0.705262	test-rmse:0.704933
[13]	train-rmse:0.704691	test-rmse:0.704403
[14]	train-rmse:0.704326	test-rmse:0.704060
[15]	train-rmse:0.703931	test-rmse:0.703715
[16]	train-rmse:0.703431	test-rmse:0.703261
[17]	train-rmse:0.703033	test-rmse:0.702882
[18]	train-rmse:0.702556	test-rmse:0.702431
[19]	train-rmse:0.702135	test-rmse:0.702046
[20]	train-rmse:0.701572	test-rmse:0.701525
[21]	train-rmse:0.701295	test-rmse:0.701279
[22]	train-rmse:0.701056	test-rmse:0.701058
[23]	train-rmse:0.700863	test-rmse:0.7009

In [27]:
xgboost_error_model_fn = "../../data/alphas/$name/xgb_error.model"
# Nothing earlier in this notebook creates the output directory, so make sure it
# exists before saving; mkpath is a no-op when the directory is already there.
mkpath(dirname(xgboost_error_model_fn))
XGBoost.save(error_bst, xgboost_error_model_fn)

In [28]:
# [300]	train-rmse:0.687583	test-rmse:0.692817

## Save params

In [29]:
# Persist what is needed to reproduce the combined prediction: the linear weights, the
# alpha names they apply to, and the paths of the two saved boosters.
# NOTE(review): write_params is defined outside this notebook; where it writes is not visible here.
write_params(
    Dict(
        "β" => β,
        "alphas" => alphas,
        "bst" => xgboost_model_fn,
        "error_bst" => xgboost_error_model_fn,
    ),
);