In [1]:
# Identifier of this notebook's output; it is interpolated into the model path
# "../../data/alphas/$name/xgb.model" at the end of the notebook.
name = "CombineSignals"
# Left empty here and not referenced again in the visible part of this notebook
# (presumably consumed by the included Alpha.ipynb — TODO confirm).
residual_alphas = [];

In [2]:
using DataFrames
using Random
import XGBoost

In [3]:
using NBInclude
@nbinclude("Alpha.ipynb");

In [4]:
@nbinclude("XGBoostFeatures.ipynb");

## Train a linear model

In [5]:
# Turn an integer-valued number into an `Int`; every other value is returned unchanged.
downcast_to_int(x) = isinteger(x) ? Int(x) : x

# Names of the signals that feed the blend. Each group contributes "<Family>.<suffix>"
# entries, concatenated in the order listed.
alphas = vcat(
    ["UserItemBiases"],
    string.("ItemCF.", downcast_to_int.([2^4, 2^6, 2^8, 2^10])),
    string.("ItemCFResid.", downcast_to_int.([2^4, 2^6, 2^8, 2^10])),
    string.("MatrixFactorization.", downcast_to_int.([10, 20, 40])),
    string.("ItemCFRelated.", ["all"]),
    # ["UserCF.1024"]
    ["ItemCFEmbed.1024"], # 0.12%
)

14-element Vector{String}:
 "UserItemBiases"
 "ItemCF.16"
 "ItemCF.64"
 "ItemCF.256"
 "ItemCF.1024"
 "ItemCFResid.16"
 "ItemCFResid.64"
 "ItemCFResid.256"
 "ItemCFResid.1024"
 "MatrixFactorization.10"
 "MatrixFactorization.20"
 "MatrixFactorization.40"
 "ItemCFRelated.all"
 "ItemCFEmbed.1024"

In [6]:
# Fit the blend weights on the validation split by solving `X * β ≈ y` with `\`
# (a least-squares solve when X has more rows than columns).
β = let
    X = get_indep("validation", alphas)
    y = get_dep("validation")
    X \ y
end

14-element Vector{Float64}:
 0.991941501581278
 0.10065864678782964
 0.101254633495204
 0.027979284197341876
 0.3084945753024404
 0.009298704395049765
 0.024201408984978807
 0.16357032412742567
 1.60804410709948
 0.15410659871205612
 0.1398040394323862
 0.13999048590054403
 3.859510613782815
 0.1230795003983232

In [7]:
"""
    evaluate(X, y, β)

Score the linear model `X * β` against the targets `y`.

Predictions are clamped to the interval `[1, 10]` before scoring. Returns the tuple
`(rmse, mae, r2, mse)`, each computed from `y` and the clamped predictions. The four
metric functions are not defined in this notebook.
"""
function evaluate(X, y, β)
    clamped = clamp.(X * β, 1, 10)
    metrics = (rmse, mae, r2, mse)
    return map(metric -> metric(y, clamped), metrics)
end;

In [8]:
evaluate(get_indep("test", alphas), get_dep("test"), β)

(1.0925873446799506, 0.7889391360879495, 0.6235580419132811, 1.1937471057547853)

## Train an XGBoost model

In [9]:
"""
    get_augmented_indep(split, alphas)

Return the feature matrix for `split`: the result of `get_indep(split, alphas)` with the
result of `get_xgboost_features(split)` appended as extra columns.
"""
function get_augmented_indep(split, alphas)
    alpha_columns = get_indep(split, alphas)
    extra_columns = get_xgboost_features(split)
    return hcat(alpha_columns, extra_columns)
end;

In [10]:
"""
    get_xgboost_split(training_perc)

Partition the validation rows into two `XGBoost.DMatrix` objects.

Features come from `get_augmented_indep("validation", alphas)`. Labels are the residuals
of the linear blend, `get_dep("validation") - get_indep("validation", alphas) * β`. Both
are converted to `Float32`. `alphas` and `β` are read from global scope.

The global RNG is reseeded with a fixed value before the rows are shuffled, so repeated
calls shuffle the rows identically.

# Arguments
- `training_perc`: fraction of the validation rows placed in the first matrix; the
  remaining rows go to the second.

# Returns
A tuple `(train, holdout)` of `XGBoost.DMatrix`.
"""
function get_xgboost_split(training_perc)
    Random.seed!(20220104)
    n_rows = length(get_dep("validation"))
    order = shuffle(1:n_rows)
    n_train = round(Int, n_rows * training_perc)

    features = Float32.(get_augmented_indep("validation", alphas))
    residuals = Float32.(get_dep("validation") - get_indep("validation", alphas) * β)

    # The first `n_train` shuffled rows train the booster; the rest are held out.
    train_rows = order[1:n_train]
    train = XGBoost.DMatrix(features[train_rows, :]; label = residuals[train_rows])
    holdout_rows = order[(n_train + 1):end]
    holdout = XGBoost.DMatrix(features[holdout_rows, :]; label = residuals[holdout_rows])
    return (train, holdout)
end;

In [11]:
"""
    fast_test_mse()

Train and return an XGBoost booster on the split from `get_xgboost_split(0.9999)`.

Runs 200 boosting rounds with the `reg:squarederror` objective, using
`Threads.nthreads()` threads. Both matrices are passed as the watchlist, labelled
`"train"` and `"test"`.
"""
function fast_test_mse()
    # TODO early stopping
    train, holdout = get_xgboost_split(0.9999)
    num_rounds = 200
    monitored = [(train, "train"), (holdout, "test")]
    booster = XGBoost.xgboost(
        train,
        num_rounds;
        watchlist = monitored,
        objective = "reg:squarederror",
        nthread = Threads.nthreads(),
    )
    return booster
end;

In [12]:
"""
    evaluate(bst)

Return the RMSE on the `"test"` split of the combined prediction: the booster's output
plus the linear blend `get_indep("test", alphas) * β`, clamped to `[1, 10]`.

`alphas` and `β` are read from global scope; `rmse` is defined outside this notebook.
"""
function evaluate(bst)
    features = Float32.(get_augmented_indep("test", alphas))
    targets = Float32.(get_dep("test"))
    residual_pred = XGBoost.predict(bst, features)
    linear_pred = get_indep("test", alphas) * β
    combined = residual_pred + linear_pred
    return rmse(targets, clamp.(combined, 1, 10))
end;

In [None]:
# Train the residual booster; the watchlist logs train/test RMSE for each round.
bst = fast_test_mse();

[32mProgress: 100%|███████████████████████████| Time: 0:00:02 ( 0.57 μs/it)[39m39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:00 (38.98 ns/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:00 (66.69 ns/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:00 (39.91 ns/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:00 (65.27 ns/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:00 (39.96 ns/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:00 ( 1.49 μs/it)[39mit)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:00 ( 1.17 μs/it)[39mm
[32mProgress: 100%|███████████████████████████| Time: 0:00:00 ( 0.82 μs/it)[39m
[1]	train-rmse:1.145249	test-rmse:1.144691
[2]	train-rmse:1.116258	test-rmse:1.110682
[3]	train-rmse:1.101579	test-rmse:1.092922
[4]	train-rmse:1.094043	test-rmse:1.083369
[5]	train-rmse:1.090222	test-rmse:1.077469
[6]	train-rmse:1.088110	test-rmse:1

In [None]:
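# Reference numbers from an earlier run (presumably the final boosting round's log line
# and the value returned by `evaluate(bst)` — TODO confirm):
# [200]	train-rmse:1.074031	test-rmse:1.079130
# 1.080729819370984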

In [None]:
# Log the test-split RMSE of the combined (linear blend + booster) prediction.
@info "XGBoost model test rmse $(evaluate(bst))"

In [None]:
# Persist the trained booster and record the blend parameters.
# The path is relative to the current working directory.
xgboost_model_fn = "../../data/alphas/$name/xgb.model"
# Create the target directory first: nothing visible in this notebook guarantees it
# exists before the booster is written, and saving into a missing directory fails.
mkpath(dirname(xgboost_model_fn))
XGBoost.save(bst, xgboost_model_fn)
# Hand the linear weights, the signal names and the booster's path to `write_params`
# (defined outside this notebook).
write_params(Dict("β" => β, "alphas" => alphas, "bst" => xgboost_model_fn));