In [1]:
# Identifier for this notebook's outputs; it is interpolated into the model paths
# under ../../data/alphas/$name further down.
name = "CombineSignals"
# NOTE(review): not referenced again in this notebook — presumably read by the
# included Alpha.ipynb; confirm. The untyped `[]` literal is a Vector{Any}.
residual_alphas = [];

In [2]:
using Random
import XGBoost

In [3]:
using NBInclude
@nbinclude("Alpha.ipynb");

In [4]:
@nbinclude("XGBoostFeatures.ipynb");

## Train a linear model

In [7]:
# Names of the signals combined by the linear model, in column order.
# The trailing numbers are carried over from the original cell — presumably the
# blend RMSE after adding each signal; confirm before relying on them.
alphas = vcat(
    # Main signals
    ["UserItemBiases"],                                # 1.3181877506606918
    "GNN." .* string.(1:1),                            # 1.1906560031606823
    ["ItemCF.Resid.GNN.1.256"],                        # 1.132600635774328
    "MatrixFactorization." .* string.([10, 20, 40]),   # 1.1272808153857012
    # Tier 2 signals (under 10 bps)
    "ItemCF." .* string.(2 .^ (4:2:12)),               # 1.1265994352956394
    ["ItemCF.Resid.ItemCF.1.256"],                     # 1.1257626649698071
    ["ItemCF.Resid.MF.1.256"],                         # 1.1253543890469837
    "GNN." .* string.(1:1) .* ".Implicit",             # 1.1248293474603284
    ["ItemCF.Resid.All.2.256"],                        # 1.124193298929656
    "UserCF." .* string.([256]),                       # 1 bps
    # ["GNN.Resid.1"] # TODO fix
    # Tier 3 signals
    ["ItemCF.Related.strict_relations"],               # 0 bps
)

17-element Vector{String}:
 "UserItemBiases"
 "GNN.1"
 "ItemCF.Resid.GNN.1.256"
 "MatrixFactorization.10"
 "MatrixFactorization.20"
 "MatrixFactorization.40"
 "ItemCF.16"
 "ItemCF.64"
 "ItemCF.256"
 "ItemCF.1024"
 "ItemCF.4096"
 "ItemCF.Resid.ItemCF.1.256"
 "ItemCF.Resid.MF.1.256"
 "GNN.1.Implicit"
 "ItemCF.Resid.All.2.256"
 "UserCF.256"
 "ItemCF.Related.strict_relations"

In [10]:
# Fit the blend weights: `\` returns the least-squares solution when its left operand
# is a non-square matrix.
# NOTE(review): assumes get_indep returns a rows × length(alphas) matrix and get_dep a
# vector of targets — confirm against the included notebooks.
β = get_indep("validation", alphas) \ get_dep("validation")

[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:11[39m


17-element Vector{Float64}:
  0.9934173358549976
  0.6121894953343386
  0.5665004089626174
  0.12292481545372368
  0.06147794783309547
  0.066433827351831
 -0.011900478488508975
  0.08768225408881886
  0.004640240278894954
 -0.14898878753284156
  0.2657140722350367
  0.8187020162211207
  0.2772115054988029
 11.043053961963476
  0.13937776085423287
  0.09594069465973559
 -0.009944736065177678

In [11]:
"""
    evaluate(X, y, β)

Score the linear blend `X * β` against the targets `y`.

Predictions are clamped to `[1, 10]` before scoring. Returns the tuple
`(rmse, mae, r2, mse)` computed by the metric helpers of the same names.
"""
function evaluate(X, y, β)
    clamped = clamp.(X * β, 1, 10)
    metrics = (rmse, mae, r2, mse)
    return map(metric -> metric(y, clamped), metrics)
end;

In [None]:
# NOTE(review): `inference_dep` and `inference` are not defined anywhere in this
# notebook; unless an included notebook provides them, this cell throws UndefVarError.
# Looks like leftover scratch work — confirm and remove.
(inference_dep - inference * β)

In [None]:
# In-sample check: β was fit on this same validation split.
evaluate(get_indep("validation", alphas), get_dep("validation"), β)

[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:00[39m


(1.123303380402306, 0.8073476874565486, 0.5721079042345558, 1.261810484423248)

In [None]:
# Score the linear blend on the test split, which was not used to fit β.
evaluate(get_indep("test", alphas), get_dep("test"), β)

In [None]:
# SOTA: 1.1241038049699092

In [None]:
# OLD RESULTS: 1.0865915182160761 

## Train an XGBoost model

In [None]:
"""
    get_xgboost_split(split, training_perc, Y, extra_features)

Build a shuffled (train, holdout) pair of `XGBoost.DMatrix` objects for `split`.

# Arguments
- `split`: name of the data split passed on to `get_dep` / `get_augmented_indep`.
- `training_perc`: fraction of rows placed in the training matrix; the remaining
  rows form the holdout matrix.
- `Y`: labels, one per row of the split.
- `extra_features`: extra columns appended to the feature matrix, or `nothing`.

# Returns
A tuple `(dtrain, dholdout)` of labelled `XGBoost.DMatrix` objects.

# Throws
- `DimensionMismatch` if the feature matrix or `Y` does not have exactly one row per
  row of the split.

# Side effects
Reseeds the global RNG with a fixed seed on every call so the row assignment is
reproducible.
"""
function get_xgboost_split(split, training_perc, Y, extra_features)
    # A fixed seed keeps the same rows in the same partition across calls.
    Random.seed!(20220104)
    n_rows = length(get_dep(split))
    order = shuffle(1:n_rows)
    n_train = round(Int, n_rows * training_perc)

    features = get_augmented_indep(split, alphas, β)
    if !isnothing(extra_features)
        features = hcat(features, extra_features)
    end
    X = convert.(Float32, features)

    # Rows are selected by position below, so a longer `Y` or `X` would otherwise be
    # truncated or misaligned without any error.
    if size(X, 1) != n_rows || length(Y) != n_rows
        throw(
            DimensionMismatch(
                "split \"$split\" has $n_rows rows, but the feature matrix has " *
                "$(size(X, 1)) rows and Y has $(length(Y)) entries",
            ),
        )
    end

    train_rows = order[1:n_train]
    holdout_rows = order[n_train+1:end]
    return (
        XGBoost.DMatrix(X[train_rows, :], label = Y[train_rows]),
        XGBoost.DMatrix(X[holdout_rows, :], label = Y[holdout_rows]),
    )
end;

In [None]:
"""
    get_xgboost_dep(split)

Return the residual of the linear blend on `split` — the targets minus
`get_indep(split, alphas) * β` — with elements converted to `Float32`.
"""
function get_xgboost_dep(split)
    targets = get_dep(split)
    linear_pred = get_indep(split, alphas) * β
    return Float32.(targets - linear_pred)
end

"""
    train_model(split, Y, extra_features = nothing; training_split_perc = 0.9, num_round = 100)

Train an XGBoost regressor with the squared-error objective on `split`.

# Arguments
- `split`: name of the data split, forwarded to `get_xgboost_split`.
- `Y`: labels to fit.
- `extra_features`: extra feature columns forwarded to `get_xgboost_split`, or
  `nothing`.

# Keywords
- `training_split_perc`: fraction of rows used for training; must be in `(0, 1]`.
  When it is exactly `1`, only the training set is put on the watchlist.
- `num_round`: number of boosting rounds.

# Returns
Whatever `XGBoost.xgboost` returns (the trained booster).

# Throws
- `ArgumentError` if `training_split_perc` is outside `(0, 1]`.
"""
function train_model(
    split,
    Y,
    extra_features = nothing;
    training_split_perc = 0.9,
    num_round = 100,
)
    if !(0 < training_split_perc <= 1)
        throw(
            ArgumentError(
                "training_split_perc must be in (0, 1], got $training_split_perc",
            ),
        )
    end
    # TODO early stopping
    dtrain, dtest = get_xgboost_split(split, training_split_perc, Y, extra_features)
    # With no held-out rows, only the training metric is reported.
    watchlist =
        training_split_perc == 1 ? [(dtrain, "train")] :
        [(dtrain, "train"), (dtest, "test")]
    return XGBoost.xgboost(
        dtrain,
        num_round,
        watchlist = watchlist,
        objective = "reg:squarederror",
        nthread = Threads.nthreads(),
    )
end;

In [None]:
"""
    evaluate(bst)

Score the linear blend plus the booster `bst` on the test split.

The blended prediction is `get_indep("test", alphas) * β` plus the booster output,
clamped to `[1, 10]`. Returns `(rmse, mae, mean_abs_booster)`, where the last entry
is the mean absolute value of the booster's own (unclamped) output.
"""
function evaluate(bst)
    booster_input = convert.(Float32, get_augmented_indep("test", alphas, β))
    targets = convert.(Float32, get_dep("test"))
    correction = XGBoost.predict(bst, booster_input)
    # Clamp once and reuse the result for both error metrics.
    blended = clamp.(get_indep("test", alphas) * β + correction, 1, 10)
    return rmse(targets, blended), mae(targets, blended), mean(abs.(correction))
end;

In [None]:
# Fit the booster on the validation split; its target is the residual left by the
# linear blend (see get_xgboost_dep).
bst = train_model("validation", get_xgboost_dep("validation"));

In [None]:
# SOTA: [100]	train-rmse:1.110816	test-rmse:1.116032

In [None]:
# Logged at debug level: the message, including the evaluate(bst) call inside it, is
# only evaluated when debug-level logging is enabled.
@debug "XGBoost model test (rmse, mae, mean(abs(bst))) = $(evaluate(bst))"

In [None]:
# SOTA: (rmse, mae, mean(abs(bst))) = (1.1132486377797628, 0.8011315065494479, 0.10305781f0)

In [None]:
# Persist the residual booster; the same path is recorded in the params written by
# the final cell.
xgboost_model_fn = "../../data/alphas/$name/xgb.model"
XGBoost.save(bst, xgboost_model_fn)

## Train a model to predict confidence intervals

In [None]:
# Reload the residual booster from disk. The path is assigned again here, presumably
# so this section can run without re-running the training cells above — confirm.
xgboost_model_fn = "../../data/alphas/$name/xgb.model"
bst = XGBoost.Booster(model_file = xgboost_model_fn);

In [None]:
"""
    get_error(bst, split)

Compute the absolute error of the blended model (linear blend plus booster `bst`)
on `split`.

Returns a tuple `(errors, bst_preds)`: a `RatingsDataset` built from the split's
`user` and `item` columns with the absolute errors as its third argument, and the
booster's raw predictions.

NOTE(review): the blended prediction is not clamped to `[1, 10]` here, unlike in
`evaluate(bst)` — confirm this is intended.
"""
function get_error(bst, split)
    booster_input = convert.(Float32, get_augmented_indep(split, alphas, β))
    targets = convert.(Float32, get_dep(split))
    booster_out = XGBoost.predict(bst, booster_input)
    blended = booster_out + get_indep(split, alphas) * β
    abs_errors = @. abs(targets - blended)
    rows = get_split(split)
    return RatingsDataset(rows.user, rows.item, abs_errors), booster_out
end;

In [None]:
# Absolute errors of the blended model on the test split, together with the booster's
# raw predictions for the same rows.
errors, test_preds = get_error(bst, "test");

In [None]:
# Fit a second booster with `errors.rating` as the target (presumably the absolute
# errors stored by get_error — confirm) and the first booster's predictions appended
# as an extra feature column.
# NOTE(review): this trains on the test split — confirm that is intended.
error_bst = train_model("test", errors.rating, test_preds);

In [None]:
# Persist the error booster next to the residual booster; the path is recorded in the
# params written by the final cell.
xgboost_error_model_fn = "../../data/alphas/$name/xgb_error.model"
XGBoost.save(error_bst, xgboost_error_model_fn)

In [None]:
# SOTA: [100]	train-rmse:0.722286	test-rmse:0.729122

In [None]:
# OLD RESULTS: [300]	train-rmse:0.687583	test-rmse:0.692817

## Save params

In [None]:
# Record the blend weights, the signal list and the paths of the two saved boosters.
# NOTE(review): write_params is defined outside this notebook; where it writes to is
# not visible here.
write_params(
    Dict(
        "β" => β,
        "alphas" => alphas,
        "bst" => xgboost_model_fn,
        "error_bst" => xgboost_error_model_fn,
    ),
);