In [1]:
# Identifier of this notebook's outputs; it is interpolated into the model
# paths under ../../data/alphas/ further down.
name = "CombineSignals"
# Not referenced again in this notebook; presumably read by the helpers pulled
# in from Alpha.ipynb — TODO confirm.
residual_alphas = [];

In [2]:
using Random
import XGBoost

In [3]:
using NBInclude
@nbinclude("Alpha.ipynb");

In [4]:
@nbinclude("XGBoostFeatures.ipynb");

## Train a linear model

In [66]:
# Names of the signals ("alphas") that get combined. Each uncommented row is a
# vector of names; the newline-separated rows are concatenated into one flat
# Vector{String}. Commented-out rows are candidates that are currently
# disabled; the trailing percentages are presumably their measured gains —
# TODO confirm.
alphas = [
    ["UserItemBiases"]
    #    ["GNN"]
    ["GNN.$K" for K = 2:2]
    # ["ItemCF.$K" for K in [2^4, 2^6, 2^8, 2^10, 2^12]]
    # ["MatrixFactorization.$K" for K in [10, 20, 40]] # TODO make 8,16,32
    # ["ItemCF.Resid.$alpha.1.$K" for alpha in ["ItemCF", "GNN", "MF"] for K in [2^8]]
    # ["GNN.Resid.$K" for K = 1:1] # 0.1%
    # ["ItemCF.Embed.$K" for K in ["256"]] # 0.0% TODO fix
    # ["ItemCF.Related.$name" for name in ["strict_relations"]] # 0.0% TODO fix
    # ["UserCF.$K" for K in [2^8]] # 0.0% TODO fix
]

2-element Vector{String}:
 "UserItemBiases"
 "GNN.2"

In [67]:
# Fit one combination weight per alpha on the validation split by solving
# get_indep(...) * β ≈ get_dep(...) with the backslash operator (a
# least-squares solve when the system is overdetermined).
β = get_indep("validation", alphas) \ get_dep("validation")

2-element Vector{Float64}:
 0.9996295146297729
 1.079251085945343

In [68]:
"""
    evaluate(X, y, β)

Score the linear combination `X * β` against the targets `y`.

Predictions are clamped to the range [1, 10] before scoring. Returns the tuple
`(rmse, mae, r2, mse)`, each computed by the metric helper of the same name
(defined outside this notebook).
"""
function evaluate(X, y, β)
    clamped_pred = clamp.(X * β, 1, 10)
    return (
        rmse(y, clamped_pred),
        mae(y, clamped_pred),
        r2(y, clamped_pred),
        mse(y, clamped_pred),
    )
end;

In [69]:
# In-sample check: score β on the same validation split it was fitted on.
# The result is (rmse, mae, r2, mse).
evaluate(get_indep("validation", alphas), get_dep("validation"), β)

(1.20766801232553, 0.893792995885124, 0.5054214706116913, 1.4584620279942964)

In [70]:
# Score the validation-fitted β on the test split, which was not used for the
# fit. The result is (rmse, mae, r2, mse).
evaluate(get_indep("test", alphas), get_dep("test"), β)

(1.2078902060877865, 0.8937317412430557, 0.5054559758548471, 1.4589987499627954)

In [52]:
# Scratch calculation: ratio of one score to 1.1264557187221176, the value
# recorded as SOTA below. Presumably a newer RMSE relative to the previous
# best — TODO confirm.
1.1247956469883074 / 1.1264557187221176

0.9985262876238993

In [17]:
# Scratch calculation: ratio of the recorded SOTA value to another score,
# presumably the RMSE it replaced — TODO confirm.
1.1264557187221176 / 1.1275709407458185

0.9990109517872434

In [11]:
# SOTA: 1.1264557187221176

In [None]:
# OLD RESULTS: 1.0865915182160761 

## Train an XGBoost model

In [None]:
"""
    get_xgboost_split(split, training_perc, Y, extra_features)

Build a shuffled train/holdout pair of `XGBoost.DMatrix` objects for `split`.

The feature matrix is `get_augmented_indep(split, alphas, β)`, with
`extra_features` appended as extra columns unless it is `nothing`, converted to
`Float32`. Rows are permuted with a fixed seed; the first
`round(rows * training_perc)` permuted rows become the training matrix and the
rest the holdout matrix. `Y` supplies the labels and is indexed with the same
permutation, so it is assumed to have one entry per row of the split.

Reads the notebook globals `alphas` and `β`, and reseeds the default RNG as a
side effect.

Returns `(dtrain, dholdout)`.
"""
function get_xgboost_split(split, training_perc, Y, extra_features)
    # Fixed seed: every call produces the same row permutation.
    Random.seed!(20220104)
    n_rows = length(get_dep(split))
    perm = shuffle(1:n_rows)
    n_train = round(Int, n_rows * training_perc)
    train_idx = perm[1:n_train]
    holdout_idx = perm[n_train+1:end]

    base_features = get_augmented_indep(split, alphas, β)
    design =
        isnothing(extra_features) ? base_features : hcat(base_features, extra_features)
    X = convert.(Float32, design)

    dtrain = XGBoost.DMatrix(X[train_idx, :], label = Y[train_idx])
    dholdout = XGBoost.DMatrix(X[holdout_idx, :], label = Y[holdout_idx])
    return dtrain, dholdout
end;

In [None]:
"""
    get_xgboost_dep(split)

Return the residuals of the linear model on `split` as `Float32`: the
dependent variable minus `get_indep(split, alphas) * β`. These residuals are
the regression target for the XGBoost model. Reads the notebook globals
`alphas` and `β`.
"""
function get_xgboost_dep(split)
    target = get_dep(split)
    linear_pred = get_indep(split, alphas) * β
    return convert.(Float32, target - linear_pred)
end

"""
    train_model(split, Y, extra_features = nothing)

Train an XGBoost regressor on `split` with labels `Y`.

90% of the shuffled rows are used for training and the remainder is held out
and reported in the watchlist as "test". The model is trained for 100 rounds
with the `reg:squarederror` objective, using `Threads.nthreads()` threads.
`extra_features`, when given, is forwarded to `get_xgboost_split` to be
appended to the feature matrix.

Returns the result of `XGBoost.xgboost`.
"""
function train_model(split, Y, extra_features = nothing)
    # TODO early stopping
    train_fraction = 0.9
    dtrain, dholdout = get_xgboost_split(split, train_fraction, Y, extra_features)

    # Only monitor the holdout set when some rows were actually held out.
    monitored = [(dtrain, "train")]
    if train_fraction != 1
        push!(monitored, (dholdout, "test"))
    end

    return XGBoost.xgboost(
        dtrain,
        100;
        watchlist = monitored,
        objective = "reg:squarederror",
        nthread = Threads.nthreads(),
    )
end;

In [None]:
"""
    evaluate(bst)

Score the combined linear + XGBoost model on the "test" split.

The final prediction is `get_indep("test", alphas) * β` plus the booster's
prediction, clamped to [1, 10]. Returns
`(rmse, mae, mean absolute booster prediction)`; the last entry shows how large
the booster's correction is on average. Reads the notebook globals `alphas`
and `β`.
"""
function evaluate(bst)
    features = convert.(Float32, get_augmented_indep("test", alphas, β))
    truth = convert.(Float32, get_dep("test"))
    correction = XGBoost.predict(bst, features)
    # Clamp once and reuse the result for both error metrics.
    clamped = clamp.(get_indep("test", alphas) * β + correction, 1, 10)
    return rmse(truth, clamped), mae(truth, clamped), mean(abs.(correction))
end;

In [None]:
# Fit the booster on the validation split, using the linear model's residuals
# as the regression target.
bst = train_model("validation", get_xgboost_dep("validation"));

In [None]:
# SOTA: [100]	train-rmse:1.111555	test-rmse:1.116668

In [None]:
# OLD RESULTS: [300]	train-rmse:1.064098	test-rmse:1.072669
# (1.0730632033877396, 0.7742225118286815, 0.12442561f0)

In [None]:
# Log the test-split metrics of the combined model.
# NOTE(review): @debug messages are only evaluated when debug-level logging is
# enabled, so with the default logger evaluate(bst) is neither run nor shown —
# confirm that a debug logger is configured (e.g. in Alpha.ipynb).
@debug "XGBoost model test (rmse, mae, mean(abs(bst))) = $(evaluate(bst))"

In [None]:
# Persist the trained booster so later sections (and other notebooks) can
# reload it from disk.
xgboost_model_fn = "../../data/alphas/$name/xgb.model"
# The save call needs the parent directory to exist; create it so the first
# run on a fresh data directory does not fail. No-op when it already exists.
mkpath(dirname(xgboost_model_fn))
XGBoost.save(bst, xgboost_model_fn)

## Train a model to predict confidence intervals

In [None]:
# Reload the saved booster from disk; presumably so this section can be run
# without retraining the model above — TODO confirm.
xgboost_model_fn = "../../data/alphas/$name/xgb.model"
bst = XGBoost.Booster(model_file = xgboost_model_fn);

In [None]:
"""
    get_error(bst, split)

Compute the absolute error of the combined linear + XGBoost prediction on
`split`.

Returns a tuple of
- a `RatingsDataset` built from the split's `user` and `item` columns, with the
  absolute errors in the rating position, and
- the raw booster predictions for the split.

Reads the notebook globals `alphas` and `β`.

NOTE(review): unlike `evaluate(bst)`, predictions are not clamped to [1, 10]
before the error is taken — confirm this is intended.
"""
function get_error(bst, split)
    features = convert.(Float32, get_augmented_indep(split, alphas, β))
    truth = convert.(Float32, get_dep(split))
    correction = XGBoost.predict(bst, features)
    combined = correction + get_indep(split, alphas) * β
    abs_errors = abs.(truth - combined)
    rows = get_split(split)
    return RatingsDataset(rows.user, rows.item, abs_errors), correction
end;

In [None]:
# Absolute errors of the combined model on the test split, plus the booster's
# raw predictions for the same rows.
errors, test_preds = get_error(bst, "test");

In [None]:
# Train a second booster on the test split that predicts the absolute error,
# with the first booster's predictions appended as an extra feature column.
error_bst = train_model("test", errors.rating, test_preds);

In [None]:
# Persist the error-predicting booster next to the main model file.
xgboost_error_model_fn = "../../data/alphas/$name/xgb_error.model"
XGBoost.save(error_bst, xgboost_error_model_fn)

In [None]:
# SOTA: [100]	train-rmse:0.720068	test-rmse:0.726745

In [None]:
# OLD RESULTS: [300]	train-rmse:0.687583	test-rmse:0.692817

## Save params

In [None]:
# Record what is needed to reproduce the combined prediction: the linear
# weights, the alpha names they apply to, and the paths of the two saved
# XGBoost models. write_params is defined outside this notebook; where it
# stores the dictionary is not visible here.
write_params(
    Dict(
        "β" => β,
        "alphas" => alphas,
        "bst" => xgboost_model_fn,
        "error_bst" => xgboost_error_model_fn,
    ),
);