In [1]:
# Name of this alpha; it is interpolated into the model paths under ../../data/alphas/
# further down in this notebook.
name = "CombineSignals"
# Empty list of residual alphas.
# NOTE(review): not read anywhere in this file — presumably consumed by Alpha.ipynb; confirm.
residual_alphas = [];

In [2]:
using Random
import XGBoost

In [3]:
using NBInclude
@nbinclude("Alpha.ipynb");

In [4]:
@nbinclude("XGBoostFeatures.ipynb");

## Train a linear model

In [5]:
# Integer-valued numbers are converted to `Int` (16.0 -> 16); anything else is returned
# unchanged. Presumably this keeps float-valued sizes from showing up as "ItemCF.16.0"
# in the alpha names built below.
function downcast_to_int(x)
    return isinteger(x) ? Int(x) : x
end

# Names of the alphas that are blended, in the order their weights appear in β.
alphas = vcat(
    ["UserItemBiases"],
    string.("ItemCF.", downcast_to_int.(2 .^ [4, 6, 8, 10])),
    string.("ItemCFResid.", downcast_to_int.(2 .^ [4, 6, 8, 10])),
    string.("MatrixFactorization.", downcast_to_int.([10, 20, 40])),
    string.("ItemCFRelated.", ["strict_relations"]),
    ["ItemCFEmbed.1024"], # 0.12%
    ["UserCF.256"], # 0.18%
)

15-element Vector{String}:
 "UserItemBiases"
 "ItemCF.16"
 "ItemCF.64"
 "ItemCF.256"
 "ItemCF.1024"
 "ItemCFResid.16"
 "ItemCFResid.64"
 "ItemCFResid.256"
 "ItemCFResid.1024"
 "MatrixFactorization.10"
 "MatrixFactorization.20"
 "MatrixFactorization.40"
 "ItemCFRelated.strict_relations"
 "ItemCFEmbed.1024"
 "UserCF.256"

In [6]:
# Fit the blend weights: `\` solves get_indep("validation", alphas) * β ≈ get_dep("validation")
# in the least-squares sense, giving one weight per entry of `alphas`.
β = get_indep("validation", alphas) \ get_dep("validation")

[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:21[39m


15-element Vector{Float64}:
 0.9915105339507934
 0.10254886875521553
 0.09787259267447602
 0.020612731821270627
 0.31986255002604275
 0.005454023205743808
 0.03989031514846255
 0.17609009549520513
 1.5825382028485147
 0.15385000641646215
 0.14219680922421626
 0.1422098479910885
 3.8177026296046708
 0.12025541716624263
 0.9162826622702647

In [7]:
# Score the linear blend `X * β` against the targets `y`.
# Predictions are clamped to the interval [1, 10] before scoring.
# Returns the tuple (rmse, mae, r2, mse), in that order.
function evaluate(X, y, β)
    clamped = clamp.(X * β, 1, 10)
    metrics = (rmse, mae, r2, mse)
    return map(metric -> metric(y, clamped), metrics)
end;

In [8]:
# Score the linear blend on the test split; the result is (rmse, mae, r2, mse).
evaluate(get_indep("test", alphas), get_dep("test"), β)

[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:23[39m


(1.0906643819307333, 0.7869540685696844, 0.624881957745506, 1.1895487940123486)

In [9]:
# Test RMSE of the linear blend: 1.0906643819307333

## Train an XGBoost model

In [10]:
# Build a shuffled train / held-out pair of XGBoost matrices for `split`.
#
#   split           - split name passed to `get_dep` and `get_augmented_indep`
#   training_perc   - fraction of the rows that goes into the training matrix
#   Y               - labels, indexed with the same shuffled row indices as the features
#   extra_features  - extra columns to append to the features, or `nothing`
#
# The global RNG is re-seeded with a fixed value on every call, so the row shuffle is
# the same each time. Returns (train DMatrix, held-out DMatrix).
function get_xgboost_split(split, training_perc, Y, extra_features)
    Random.seed!(20220104)
    n_rows = length(get_dep(split))
    row_order = shuffle(1:n_rows)
    n_train = round(Int, n_rows * training_perc)

    design = get_augmented_indep(split, alphas, β)
    if extra_features !== nothing
        design = hcat(design, extra_features)
    end
    design32 = convert.(Float32, design)

    # The first `n_train` shuffled rows are used for training, the rest are held out.
    train_rows = row_order[1:n_train]
    heldout_rows = row_order[n_train+1:end]
    as_dmatrix(rows) = XGBoost.DMatrix(design32[rows, :], label = Y[rows])
    return as_dmatrix(train_rows), as_dmatrix(heldout_rows)
end;

In [11]:
# Target for the booster: what is left of `get_dep(split)` after subtracting the linear
# blend `get_indep(split, alphas) * β`, converted to Float32.
function get_xgboost_dep(split)
    observed = get_dep(split)
    linear_pred = get_indep(split, alphas) * β
    return convert.(Float32, observed - linear_pred)
end

# Train an XGBoost regressor (squared-error objective) on `split`.
#
# Arguments:
#   split           - split name forwarded to `get_xgboost_split`
#   Y               - regression targets; `get_xgboost_split` indexes them with the same
#                     row indices as the features
#   extra_features  - optional extra feature columns (`nothing` for none)
# Keywords:
#   training_split_perc - fraction of rows used for training (default 0.9); the rest is
#                         only monitored through the watchlist
#   num_rounds          - number of boosting rounds (default 300)
#
# Returns the result of `XGBoost.xgboost`, i.e. the trained booster.
# Throws `ArgumentError` if `training_split_perc` is outside (0, 1].
function train_model(
    split,
    Y,
    extra_features = nothing;
    training_split_perc = 0.9,
    num_rounds = 300,
)
    0 < training_split_perc <= 1 || throw(
        ArgumentError("training_split_perc must be in (0, 1], got $training_split_perc"),
    )
    # TODO early stopping
    dtrain, dtest = get_xgboost_split(split, training_split_perc, Y, extra_features)
    # When every row is used for training there is no held-out matrix worth monitoring.
    watchlist = [(dtrain, "train")]
    if training_split_perc != 1
        push!(watchlist, (dtest, "test"))
    end
    return XGBoost.xgboost(
        dtrain,
        num_rounds,
        watchlist = watchlist,
        objective = "reg:squarederror",
        nthread = Threads.nthreads(),
    )
end;

In [12]:
# Score the linear blend plus the booster's correction on the test split.
# Returns (rmse, mae, mean absolute booster prediction); the two error metrics are
# computed on predictions clamped to [1, 10].
function evaluate(bst)
    features = convert.(Float32, get_augmented_indep("test", alphas, β))
    truth = convert.(Float32, get_dep("test"))
    xgboost_preds = XGBoost.predict(bst, features)
    blended = clamp.(get_indep("test", alphas) * β + xgboost_preds, 1, 10)
    return rmse(truth, blended), mae(truth, blended), mean(abs.(xgboost_preds))
end;

In [None]:
# Train the booster on the validation split, using the residual of the linear blend
# (see get_xgboost_dep) as the regression target.
bst = train_model("validation", get_xgboost_dep("validation"));

[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:02[39m
[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:02[39m
[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:01[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:02 ( 0.59 μs/it)[39mit)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:00 ( 0.21 μs/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:02 ( 0.48 μs/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:04 ( 1.13 μs/it)[39m/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:00 ( 0.10 ms/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:04 ( 1.03 μs/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:04 ( 1.15 μs/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:04 ( 1.04 μs/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:00 ( 1.37 μs/it)[39m

In [None]:
#[100]	train-rmse:1.074687
#(1.0776735836062472, 0.7779856850061714, 0.11236783f0)

# [300]	train-rmse:1.066483
# 1.0753573535488306

In [None]:
# `evaluate(bst)` returns three values, so the label names all of them.
@debug "XGBoost model test (rmse, mae, mean abs xgboost pred) = $(evaluate(bst))"

In [None]:
# Persist the trained booster under this alpha's data directory.
xgboost_model_fn = "../../data/alphas/$name/xgb.model"
# Make sure the target directory exists first, so a freshly trained model is not lost
# to a missing-path error at save time.
mkpath(dirname(xgboost_model_fn))
XGBoost.save(bst, xgboost_model_fn)

## Train a model to predict confidence intervals

In [None]:
# Load the booster back from the path it was saved to above
# (presumably so the cells below can be run without retraining — confirm).
xgboost_model_fn = "../../data/alphas/$name/xgb.model"
bst = XGBoost.Booster(model_file = xgboost_model_fn)

In [None]:
# Absolute error of the combined prediction (booster + linear blend) on `split`.
# Unlike `evaluate`, the combined prediction is not clamped here.
# Returns a `RatingsDataset` holding the split's users, items and absolute errors,
# together with the raw booster predictions.
function get_error(bst, split)
    features = convert.(Float32, get_augmented_indep(split, alphas, β))
    truth = convert.(Float32, get_dep(split))
    bst_preds = XGBoost.predict(bst, features)
    combined = bst_preds + get_indep(split, alphas) * β
    abs_errors = abs.(truth - combined)
    rows = get_split(split)
    return RatingsDataset(rows.user, rows.item, abs_errors), bst_preds
end;

In [None]:
# Absolute errors of the combined prediction on the test split, plus the booster's raw
# test predictions.
errors, test_preds = get_error(bst, "test");

In [None]:
# Extra columns for the error model: the booster's test predictions next to the output of
# `count_feature`.
# NOTE(review): `count_feature` is defined outside this file — confirm what it counts.
extra_error_features = hcat(test_preds, count_feature("test", true; source = "validation"));

In [None]:
# Train a second booster on the test split whose target is the absolute error computed
# above, with the extra columns appended to the features.
error_bst = train_model("test", errors.rating, extra_error_features);

In [None]:
# Persist the error-predicting booster alongside the main model.
xgboost_error_model_fn = "../../data/alphas/$name/xgb_error.model"
XGBoost.save(error_bst, xgboost_error_model_fn)

In [None]:
# [300]	train-rmse:0.687583	test-rmse:0.692817

## Save params

In [None]:
# Record the linear weights, the alpha list and the two model file paths.
# NOTE(review): `write_params` is defined outside this file — confirm where it writes.
write_params(
    Dict(
        "β" => β,
        "alphas" => alphas,
        "bst" => xgboost_model_fn,
        "error_bst" => xgboost_error_model_fn,
    ),
);