In [1]:
# Identifier of this alpha; it is interpolated into the model file paths further down.
name = "CombineSignals"
# Explicitly-typed spelling of the empty, untyped vector literal.
residual_alphas = Any[];

In [2]:
using Random
import XGBoost

In [3]:
using NBInclude
@nbinclude("Alpha.ipynb");

In [4]:
@nbinclude("XGBoostFeatures.ipynb");

## Train a linear model

In [57]:
# Ordered names of the signals that feed the blend.
# NOTE(review): the number trailing each group looks like the blend's RMSE after that
# group was added -- confirm before relying on it.
alphas = vcat(
    # Main signals
    ["UserItemBiases"],                                            # 1.3181877506606918
    ["GNN.$depth" for depth in 1:1],                               # 1.1906560031606823
    ["ItemCF.Resid.$base.1.256" for base in ["GNN"]],              # 1.132600635774328
    ["MatrixFactorization.$rank" for rank in [10, 20, 40]],        # 1.1272808153857012
    # Tier 2 signals (under 10 bps)
    ["ItemCF.$neighbors" for neighbors in 2 .^ (4:2:12)],          # 1.1265994352956394
    ["ItemCF.Resid.$base.1.256" for base in ["ItemCF"]],           # 1.1257626649698071
    ["ItemCF.Resid.$base.1.256" for base in ["MF"]],               # 1.1253543890469837
    ["GNN.$depth.Implicit" for depth in 1:1],                      # 1.1248293474603284
    ["ItemCF.Resid.All.2.256"],                                    # 1.124193298929656
    ["UserCF.$neighbors" for neighbors in [256]], # 1 bps          # 1.1241038049699092
    # Tier 3 signals
    ["ItemCF.Related.strict_relations"], # 0 bps
    # Experimental signals
    ["GNN2.$depth" for depth in 1:1],                              # 1.122060394525166
    ["GNN3.$depth" for depth in 1:1],                              # 1.1217360476653944
    ["GNN.Rating.$depth" for depth in 1:2],                        # 1.1208090298543265
)

21-element Vector{String}:
 "UserItemBiases"
 "GNN.1"
 "ItemCF.Resid.GNN.1.256"
 "MatrixFactorization.10"
 "MatrixFactorization.20"
 "MatrixFactorization.40"
 "ItemCF.16"
 "ItemCF.64"
 "ItemCF.256"
 "ItemCF.1024"
 "ItemCF.4096"
 "ItemCF.Resid.ItemCF.1.256"
 "ItemCF.Resid.MF.1.256"
 "GNN.1.Implicit"
 "ItemCF.Resid.All.2.256"
 "UserCF.256"
 "ItemCF.Related.strict_relations"
 "GNN2.1"
 "GNN3.1"
 "GNN.Rating.1"
 "GNN.Rating.2"

In [58]:
# Fit the blend weights with the backslash solver, using the "validation" split for both
# the inputs (`get_indep`) and the targets (`get_dep`); one weight per entry of `alphas`.
# NOTE(review): presumably `get_indep` returns a tall rows-by-signals matrix, so `\`
# yields the least-squares solution -- confirm against its definition.
β = get_indep("validation", alphas) \ get_dep("validation")

[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:10[39m


21-element Vector{Float64}:
  0.994355089739851
  0.33883046399153754
  0.565401029934941
  0.0960868288926458
  0.02943074028042504
  0.04781073429226963
 -0.015649582313815913
  0.07606887896327719
 -0.007668759740711459
 -0.13376886809440539
  0.2192141727643943
  0.6744092468971044
  0.3149980474144776
  7.619386708064723
  0.14356710544974838
  0.0652708925548949
 -0.014829387573519467
  0.12179693149107637
  0.04123801886845221
  0.13587008224171196
  0.1239648759875794

In [59]:
"""
    evaluate(X, y, β)

Score the linear blend `X * β` against the targets `y`.

Predictions are clamped to the interval `[1, 10]` before scoring. Returns the tuple
`(rmse, mae, r2, mse)`, each computed as `metric(y, prediction)`.
"""
function evaluate(X, y, β)
    clipped = clamp.(X * β, 1, 10)
    metrics = (rmse, mae, r2, mse)
    return map(metric -> metric(y, clipped), metrics)
end;

In [60]:
# Score the fitted blend on the "validation" split; the tuple is (rmse, mae, r2, mse).
# β was fitted from this same split, so these are in-sample numbers.
evaluate(get_indep("validation", alphas), get_dep("validation"), β)

[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:00[39m


(1.119792853800615, 0.8052825054532469, 0.5747782057793128, 1.2539360354229256)

In [61]:
# Score the same β on the "test" split; the tuple is (rmse, mae, r2, mse).
# β itself was fitted from the "validation" split.
evaluate(get_indep("test", alphas), get_dep("test"), β)

[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:10[39m


(1.1206490135973346, 0.8056428025840172, 0.5743141002704348, 1.255854211676679)

In [56]:
# Scratch calculation: ratio of two recorded values.
# NOTE(review): where these two numbers come from is not recorded in this notebook --
# confirm their provenance or delete the cell.
1.1791349318111632 / 1.1802368829329453

0.9990663305496404

In [50]:
# Scratch calculation: the numerator is the value annotated on the last `alphas` group and
# the denominator is the value recorded as SOTA in this notebook, so this is their ratio.
1.1208090298543265 / 1.1209761862123244

0.9998508832212014

In [12]:
# SOTA: 1.1209761862123244

In [13]:
# OLD RESULTS: 1.0865915182160761 

## Train an XGBoost model

In [14]:
"""
    get_xgboost_split(split, training_perc, Y, extra_features)

Build a shuffled train / held-out pair of `XGBoost.DMatrix` objects for `split`.

The rows are permuted after reseeding the global RNG with a fixed seed. The first
`round(rows * training_perc)` permuted rows form the training part and the rest form
the held-out part. Features come from `get_augmented_indep(split, alphas, β)`, with
`extra_features` appended column-wise unless it is `nothing`, and are converted to
`Float32`. Labels are the matching entries of `Y`.

Returns `(train, heldout)`.
"""
function get_xgboost_split(split, training_perc, Y, extra_features)
    # Fixed seed so the permutation below is reproducible between runs.
    Random.seed!(20220104)
    nrows = length(get_dep(split))
    order = shuffle(1:nrows)
    ntrain = round(Int, nrows * training_perc)
    train_rows = order[1:ntrain]
    heldout_rows = order[ntrain+1:end]

    base = get_augmented_indep(split, alphas, β)
    features = isnothing(extra_features) ? base : hcat(base, extra_features)
    X = convert.(Float32, features)

    return (
        XGBoost.DMatrix(X[train_rows, :], label = Y[train_rows]),
        XGBoost.DMatrix(X[heldout_rows, :], label = Y[heldout_rows]),
    )
end;

In [15]:
"""
    get_xgboost_dep(split)

Return the residual of the linear blend on `split`, i.e. `get_dep(split)` minus
`get_indep(split, alphas) * β`, converted element-wise to `Float32`.
"""
function get_xgboost_dep(split)
    target = get_dep(split)
    linear_fit = get_indep(split, alphas) * β
    return convert.(Float32, target - linear_fit)
end

"""
    train_model(split, Y, extra_features = nothing; training_split_perc = 0.9, num_rounds = 100)

Train an XGBoost regressor with the "reg:squarederror" objective on `split`.

# Arguments
- `split`: split name forwarded to `get_xgboost_split`.
- `Y`: labels forwarded to `get_xgboost_split`.
- `extra_features`: optional extra feature columns forwarded to `get_xgboost_split`;
  `nothing` (the default) adds none.

# Keywords
- `training_split_perc`: fraction of the rows used for fitting; the remainder is
  reported on the watchlist as "test". When it equals 1, only the training set is watched.
- `num_rounds`: number of boosting rounds.

# Returns
Whatever `XGBoost.xgboost` returns (the trained booster).

# Throws
- `ArgumentError` if `training_split_perc` is outside `(0, 1]` or `num_rounds < 1`.
"""
function train_model(
    split,
    Y,
    extra_features = nothing;
    training_split_perc = 0.9,
    num_rounds = 100,
)
    0 < training_split_perc <= 1 || throw(
        ArgumentError("training_split_perc must be in (0, 1], got $training_split_perc"),
    )
    num_rounds >= 1 ||
        throw(ArgumentError("num_rounds must be at least 1, got $num_rounds"))

    # TODO early stopping
    dtrain, dtest = get_xgboost_split(split, training_split_perc, Y, extra_features)
    # With every row used for fitting there is nothing held out to monitor.
    # NOTE(review): in that case `get_xgboost_split` still builds a zero-row DMatrix --
    # confirm XGBoost accepts that before calling with training_split_perc = 1.
    watchlist =
        training_split_perc == 1 ? [(dtrain, "train")] :
        [(dtrain, "train"), (dtest, "test")]
    return XGBoost.xgboost(
        dtrain,
        num_rounds,
        watchlist = watchlist,
        objective = "reg:squarederror",
        nthread = Threads.nthreads(),
    )
end;

In [16]:
"""
    evaluate(bst)

Score the linear blend plus the booster `bst` on the "test" split.

The final prediction is `get_indep("test", alphas) * β` plus the booster output, clamped
to `[1, 10]`. Returns `(rmse, mae, mean_abs_booster_output)`, where the last entry is the
mean absolute value of the raw (unclamped) booster output.
"""
function evaluate(bst)
    features = convert.(Float32, get_augmented_indep("test", alphas, β))
    truth = convert.(Float32, get_dep("test"))
    correction = XGBoost.predict(bst, features)
    clipped = clamp.(get_indep("test", alphas) * β + correction, 1, 10)
    return rmse(truth, clipped), mae(truth, clipped), mean(abs.(correction))
end;

In [17]:
# Train the booster on the "validation" split, using the residual of the linear blend
# (see `get_xgboost_dep`) as the regression target.
bst = train_model("validation", get_xgboost_dep("validation"));

[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:01[39m
[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:01[39m
[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:00[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:04 ( 0.46 μs/it)[39m/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:03 ( 0.40 μs/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:08 ( 0.87 μs/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:00 (96.18 μs/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:08 ( 0.84 μs/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:00 ( 1.28 μs/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:08 ( 0.86 μs/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:00 (85.53 μs/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:08 ( 0.86 μs/it)[39m
[32mPr

In [18]:
# SOTA: [100]	train-rmse:1.110816	test-rmse:1.116032

In [19]:
# Log, at debug level, the "test"-split rmse, mae and mean absolute booster output.
@debug "XGBoost model test (rmse, mae, mean(abs(bst))) = $(evaluate(bst))"

[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:01[39m
[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:00[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:04 ( 0.43 μs/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:03 ( 0.38 μs/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:08 ( 0.85 μs/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:08 ( 0.84 μs/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:07 ( 0.83 μs/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:08 ( 0.87 μs/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:00 ( 1.74 μs/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:00 ( 1.61 μs/it)[39m
[38;5;4m[1m[ [22m[39m[38;5;4m[1mDebug: [22m[39m20220402 02:38:32 nsfw categories: String7["gray", "white"]
[32mProgress: 100%|███████████████████████████| Time: 0:00:00 ( 

In [20]:
# SOTA: (rmse, mae, mean(abs(bst))) = (1.1112835080064754, 0.7997173869660834, 0.094115466f0)

In [21]:
# Save the residual booster under this alpha's data directory.
# NOTE(review): assumes the target directory already exists -- confirm it is created elsewhere.
xgboost_model_fn = "../../data/alphas/$name/xgb.model"
XGBoost.save(bst, xgboost_model_fn)

## Train a model to predict confidence intervals

In [22]:
# Load the booster back from the saved model file.
# NOTE(review): presumably here so this section can be run without retraining -- confirm.
xgboost_model_fn = "../../data/alphas/$name/xgb.model"
bst = XGBoost.Booster(model_file = xgboost_model_fn);

In [23]:
"""
    get_error(bst, split)

Compute the absolute error of the blended prediction on `split`.

The blended prediction is the booster output plus `get_indep(split, alphas) * β`; it is
not clamped here. Returns a tuple of a `RatingsDataset` built from the split's `user` and
`item` columns with the absolute errors as its third field, and the raw booster output.
"""
function get_error(bst, split)
    features = convert.(Float32, get_augmented_indep(split, alphas, β))
    truth = convert.(Float32, get_dep(split))
    bst_preds = XGBoost.predict(bst, features)
    blended = bst_preds + get_indep(split, alphas) * β
    abs_errors = abs.(truth - blended)
    ratings = get_split(split)
    return RatingsDataset(ratings.user, ratings.item, abs_errors), bst_preds
end;

In [24]:
# Absolute errors of the blended prediction on the "test" split, plus the raw booster output.
errors, test_preds = get_error(bst, "test");

[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:01[39m


In [25]:
# Fit a second booster on the "test" split with `errors.rating` as the target and the first
# booster's output passed as extra features.
# NOTE(review): `errors.rating` presumably holds the absolute errors stored by `get_error` -- confirm.
error_bst = train_model("test", errors.rating, test_preds);

[1]	train-rmse:0.783657	test-rmse:0.787480
[2]	train-rmse:0.760619	test-rmse:0.764573
[3]	train-rmse:0.748251	test-rmse:0.752300
[4]	train-rmse:0.741345	test-rmse:0.745437
[5]	train-rmse:0.737062	test-rmse:0.741226
[6]	train-rmse:0.734534	test-rmse:0.738752
[7]	train-rmse:0.732906	test-rmse:0.737223
[8]	train-rmse:0.731306	test-rmse:0.735717
[9]	train-rmse:0.730313	test-rmse:0.734734
[10]	train-rmse:0.729530	test-rmse:0.734010
[11]	train-rmse:0.728952	test-rmse:0.733497
[12]	train-rmse:0.728419	test-rmse:0.732978
[13]	train-rmse:0.727503	test-rmse:0.732112
[14]	train-rmse:0.727039	test-rmse:0.731718
[15]	train-rmse:0.726686	test-rmse:0.731409
[16]	train-rmse:0.726311	test-rmse:0.731062
[17]	train-rmse:0.725962	test-rmse:0.730785
[18]	train-rmse:0.725677	test-rmse:0.730522
[19]	train-rmse:0.725284	test-rmse:0.730189
[20]	train-rmse:0.724862	test-rmse:0.729826
[21]	train-rmse:0.724647	test-rmse:0.729628
[22]	train-rmse:0.724390	test-rmse:0.729406
[23]	train-rmse:0.724000	test-rmse:0.7290

In [26]:
# Save the error booster under this alpha's data directory.
# NOTE(review): assumes the target directory already exists -- confirm it is created elsewhere.
xgboost_error_model_fn = "../../data/alphas/$name/xgb_error.model"
XGBoost.save(error_bst, xgboost_error_model_fn)

In [27]:
# SOTA: [100]	train-rmse:0.715822	test-rmse:0.723105

In [28]:
# OLD RESULTS: [300]	train-rmse:0.687583	test-rmse:0.692817

## Save params

In [29]:
# Hand the blend weights, the signal list and the two saved model file paths to
# `write_params` (defined outside this notebook).
write_params(
    Dict(
        "β" => β,
        "alphas" => alphas,
        "bst" => xgboost_model_fn,
        "error_bst" => xgboost_error_model_fn,
    ),
);