In [1]:
# Identifier of this alpha; it is interpolated into the model output path in the last cell.
name = "CombineSignals"
# Empty list of residual alphas. Not referenced in the cells of this notebook — presumably
# read by the helpers pulled in from Alpha.ipynb (TODO confirm).
residual_alphas = Any[];

In [2]:
using Random
import XGBoost

In [3]:
using NBInclude
@nbinclude("Alpha.ipynb");

In [4]:
@nbinclude("XGBoostFeatures.ipynb");

## Train a linear model

In [5]:
# Convert integer-valued numbers to `Int`; any other value is returned unchanged.
downcast_to_int(x) = isinteger(x) ? Int(x) : x

# Names of the alphas combined by this notebook. Each family is expanded from its list of
# hyperparameter values and the families are concatenated into one flat vector of names.
alphas = let
    cf_params = downcast_to_int.([2^4, 2^6, 2^8, 2^10])
    mf_params = downcast_to_int.([10, 20, 40])
    vcat(
        ["UserItemBiases"],
        string.("ItemCF.", cf_params),
        string.("ItemCFResid.", cf_params),
        string.("MatrixFactorization.", mf_params),
        string.("ItemCFRelated.", ["all"]),
        ["UserCF.256"], # 0.18%
        ["ItemCFEmbed.1024"], # 0.12%
    )
end

15-element Vector{String}:
 "UserItemBiases"
 "ItemCF.16"
 "ItemCF.64"
 "ItemCF.256"
 "ItemCF.1024"
 "ItemCFResid.16"
 "ItemCFResid.64"
 "ItemCFResid.256"
 "ItemCFResid.1024"
 "MatrixFactorization.10"
 "MatrixFactorization.20"
 "MatrixFactorization.40"
 "ItemCFRelated.all"
 "UserCF.256"
 "ItemCFEmbed.1024"

In [6]:
# Blend weights: solve `X_val * β ≈ y_val` with the backslash operator on the validation
# split. The result has one entry per name in `alphas` (see the output below).
β = let X_val = get_indep("validation", alphas), y_val = get_dep("validation")
    X_val \ y_val
end

15-element Vector{Float64}:
 0.9914486118185715
 0.10588654775680703
 0.09664928759104498
 0.014671622978912145
 0.32337929785979685
 0.004183635362108991
 0.03690511858119151
 0.17652059650310126
 1.5818494637588048
 0.15294316034544914
 0.14121783033991342
 0.1425234791555824
 3.931713863450469
 0.9167415209044486
 0.12299620313042041

In [7]:
"""
    evaluate(X, y, β)

Score the linear model `X * β` against the targets `y`.

Predictions are clamped to the range 1–10 before scoring.

# Returns
The tuple `(rmse, mae, r2, mse)` computed from `y` and the clamped predictions.
"""
function evaluate(X, y, β)
    ŷ = clamp.(X * β, 1, 10)
    return rmse(y, ŷ), mae(y, ŷ), r2(y, ŷ), mse(y, ŷ)
end;

In [8]:
# Score the linear blend on the test split; the cell shows (rmse, mae, r2, mse).
let X_test = get_indep("test", alphas), y_test = get_dep("test")
    evaluate(X_test, y_test, β)
end

(1.090570235049966, 0.786988102932762, 0.6249467158322726, 1.1893434375769378)

In [9]:
# Relative RMSE reduction, 1 - new / old. The numerator is the linear blend's test RMSE
# printed above; the denominator is hard-coded — presumably the test RMSE of an earlier
# configuration (TODO confirm which one).
1 - 1.090570235049966 / 1.0925873446799506

0.00184617700342804

## Train an XGBoost model

In [10]:
"""
    get_augmented_indep(split, alphas)

Build the feature matrix for the XGBoost model on `split`: the columns returned by
`get_indep(split, alphas)` followed by the columns of `get_xgboost_features(split)`.
"""
function get_augmented_indep(split, alphas)
    alpha_columns = get_indep(split, alphas)
    extra_columns = get_xgboost_features(split)
    return hcat(alpha_columns, extra_columns)
end;

In [11]:
"""
    get_xgboost_split(training_perc)

Split the validation data into two `XGBoost.DMatrix` objects for fitting and monitoring
the booster.

Features are `get_augmented_indep("validation", alphas)`; labels are the residuals of the
linear blend, `get_dep("validation") - get_indep("validation", alphas) * β`. Both are
converted to `Float32`. Rows are shuffled after seeding the global RNG with a fixed seed,
so the assignment of rows is reproducible for a given number of validation rows.

Reads the globals `alphas` and `β`.

# Arguments
- `training_perc`: fraction of the validation rows, in `[0, 1]`, placed in the first
  matrix; the remaining rows go to the second.

# Returns
The tuple `(dtrain, dtest)`.

# Throws
- `ArgumentError` if `training_perc` is outside `[0, 1]` (including `NaN`).
"""
function get_xgboost_split(training_perc)
    # Fail early with a clear message instead of a BoundsError / InexactError further down.
    0 <= training_perc <= 1 ||
        throw(ArgumentError("training_perc must be in [0, 1], got $training_perc"))

    # Fixed seed keeps the shuffle reproducible. Note that this reseeds the global RNG.
    Random.seed!(20220104)
    val_rows = length(get_dep("validation"))
    val_shuffle = shuffle(1:val_rows)
    val_train_size = round(Int, val_rows * training_perc)
    train_rows = val_shuffle[1:val_train_size]
    test_rows = val_shuffle[val_train_size+1:end]

    X = convert.(Float32, get_augmented_indep("validation", alphas))
    # The booster is trained on what the linear blend leaves unexplained.
    Y = convert.(Float32, get_dep("validation") - get_indep("validation", alphas) * β)

    return (
        XGBoost.DMatrix(X[train_rows, :], label = Y[train_rows]),
        XGBoost.DMatrix(X[test_rows, :], label = Y[test_rows]),
    )
end;

In [12]:
"""
    fast_test_mse(; num_rounds = 200, training_perc = 0.8)

Train an XGBoost regressor with the `reg:squarederror` objective on a split of the
validation data and return the value produced by `XGBoost.xgboost`.

Despite the name, no MSE is returned: the train/test RMSE per round is only reported
through the watchlist, which contains both parts of the split.

# Keywords
- `num_rounds`: number of boosting rounds passed to `XGBoost.xgboost`.
- `training_perc`: fraction of validation rows used for training, forwarded to
  `get_xgboost_split`.
"""
function fast_test_mse(; num_rounds = 200, training_perc = 0.8)
    # TODO early stopping
    dtrain, dtest = get_xgboost_split(training_perc)
    bst = XGBoost.xgboost(
        dtrain,
        num_rounds,
        watchlist = [(dtrain, "train"), (dtest, "test")],
        objective = "reg:squarederror",
        nthread = Threads.nthreads(),
    )
    return bst
end;

In [13]:
"""
    evaluate(bst)

Return the test-split RMSE of the combined model: the prediction of `bst` on the augmented
test features plus the linear blend `get_indep("test", alphas) * β`, clamped to the range
1–10 before scoring.

Reads the globals `alphas` and `β`.
"""
function evaluate(bst)
    features = convert.(Float32, get_augmented_indep("test", alphas))
    targets = convert.(Float32, get_dep("test"))
    booster_part = XGBoost.predict(bst, features)
    # The booster was fit on residuals, so the linear blend is added back.
    combined = booster_part + get_indep("test", alphas) * β
    return rmse(targets, clamp.(combined, 1, 10))
end;

In [14]:
# Train the booster; per-round train/test RMSE is printed via the watchlist.
bst = fast_test_mse();

[32mProgress: 100%|███████████████████████████| Time: 0:00:02 ( 0.52 μs/it)[39m39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:00 (35.30 ns/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:00 (65.89 ns/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:00 (37.99 ns/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:00 (64.59 ns/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:00 (39.59 ns/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:00 ( 1.47 μs/it)[39mit)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:00 ( 1.13 μs/it)[39mm
[32mProgress: 100%|███████████████████████████| Time: 0:00:00 ( 0.83 μs/it)[39m
[1]	train-rmse:1.143663	test-rmse:1.142987
[2]	train-rmse:1.114728	test-rmse:1.114004
[3]	train-rmse:1.100077	test-rmse:1.099337
[4]	train-rmse:1.092622	test-rmse:1.091877
[5]	train-rmse:1.088743	test-rmse:1.088008
[6]	train-rmse:1.086761	test-rmse:1

In [15]:
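# Results recorded from a previous run, kept for comparison:
#   watchlist at the last round: [200]	train-rmse:1.071622	test-rmse:1.076803
#   XGBoost model RMSE on the test split: 1.0787960391164935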

In [16]:
# Log the test-split RMSE of the linear blend plus booster.
@info "XGBoost model test rmse $(evaluate(bst))"

[32mProgress: 100%|███████████████████████████| Time: 0:00:00 (46.64 ns/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:00 (35.39 ns/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:00 (63.44 ns/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:00 (38.09 ns/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:00 (61.52 ns/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:00 (39.07 ns/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:00 ( 1.06 μs/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:00 ( 0.87 μs/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:00 ( 0.75 μs/it)[39m
[38;5;6m[1m[ [22m[39m[38;5;6m[1mInfo: [22m[39m20220108 05:11:29 XGBoost model test rmse 1.0787960391164935


In [17]:
# Save the trained booster under this alpha's data directory, then record the blend
# weights, the alpha list and the model path through `write_params`.
xgboost_model_fn = "../../data/alphas/$name/xgb.model"
# Create the target directory if it is missing so the save cannot fail on a fresh checkout.
mkpath(dirname(xgboost_model_fn))
XGBoost.save(bst, xgboost_model_fn)
write_params(Dict("β" => β, "alphas" => alphas, "bst" => xgboost_model_fn));