In [1]:
# Identifier of this alpha; it is interpolated into the model paths under
# ../../data/alphas/$name later in this notebook.
name = "CombineSignals"
# NOTE(review): not referenced again in the visible part of this notebook;
# presumably consumed by the included Alpha.ipynb — confirm.
residual_alphas = [];

In [2]:
using Random
import XGBoost

In [3]:
using NBInclude
@nbinclude("Alpha.ipynb");

In [4]:
@nbinclude("XGBoostFeatures.ipynb");

## Train a linear model

In [5]:
# Convert integer-valued numbers to Int and leave everything else untouched.
# For the Int sizes listed below this is a no-op; it only matters if a size is
# ever written as a float (e.g. 16.0), so the interpolated name has no ".0".
downcast_to_int(x) = isinteger(x) ? Int(x) : x

# Names of the signals that are blended together, in the column order used for
# the linear fit and for the XGBoost features.
alphas = vcat(
    ["UserItemBiases"],
    map(K -> "ItemCF.$K", downcast_to_int.(2 .^ (4:2:10))),
    map(K -> "ItemCFResid.$K", downcast_to_int.(2 .^ (4:2:10))),
    map(K -> "MatrixFactorization.$K", downcast_to_int.([10, 20, 40])),
    map(relation -> "ItemCFRelated.$relation", ["strict_relations"]),
    ["UserCF.256"], # 0.18%
    ["ItemCFEmbed.1024"], # 0.12%
)

15-element Vector{String}:
 "UserItemBiases"
 "ItemCF.16"
 "ItemCF.64"
 "ItemCF.256"
 "ItemCF.1024"
 "ItemCFResid.16"
 "ItemCFResid.64"
 "ItemCFResid.256"
 "ItemCFResid.1024"
 "MatrixFactorization.10"
 "MatrixFactorization.20"
 "MatrixFactorization.40"
 "ItemCFRelated.strict_relations"
 "UserCF.256"
 "ItemCFEmbed.1024"

In [6]:
# Fit the blending weights on the validation split by solving
# get_indep(...) * β ≈ get_dep(...) with `\` (least squares when the left-hand
# side is not square). One weight per entry of `alphas`.
# NOTE(review): assumes get_indep returns a rows × length(alphas) matrix — confirm.
β = get_indep("validation", alphas) \ get_dep("validation")

[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:03[39m


15-element Vector{Float64}:
 0.9915105339507934
 0.10254886875521553
 0.09787259267447602
 0.020612731821270627
 0.31986255002604275
 0.005454023205743808
 0.03989031514846255
 0.17609009549520513
 1.5825382028485147
 0.15385000641646215
 0.14219680922421626
 0.1422098479910885
 3.8177026296046708
 0.9162826622702647
 0.12025541716624263

In [7]:
# Score the linear blend X * β against the targets y.
# Predictions are clamped to [1, 10] before scoring.
# Returns the tuple (rmse, mae, r2, mse).
function evaluate(X, y, β)
    clamped = clamp.(X * β, 1, 10)
    metrics = (rmse, mae, r2, mse)
    return map(metric -> metric(y, clamped), metrics)
end;

In [8]:
# Score the linear blend on the test split; the displayed tuple is
# (rmse, mae, r2, mse) as returned by evaluate(X, y, β).
evaluate(get_indep("test", alphas), get_dep("test"), β)

[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:22[39m


(1.0906643819307333, 0.7869540685696844, 0.624881957745506, 1.1895487940123486)

## Train an XGBoost model

In [22]:
# Build the (train, holdout) XGBoost DMatrix pair for `split`.
#
# Rows are shuffled with a fixed seed so the partition is reproducible; the
# first round(rows * training_perc) shuffled rows go to the training matrix and
# the remainder to the holdout matrix. Features come from
# get_augmented_indep(split, alphas, β), with `extra_features` appended as
# additional columns when it is not `nothing`. `Y` supplies the labels.
function get_xgboost_split(split, training_perc, Y, extra_features)
    Random.seed!(20220104)
    n_rows = length(get_dep(split))
    order = shuffle(1:n_rows)
    n_train = Int(round(n_rows * training_perc))
    train_idx = order[1:n_train]
    holdout_idx = order[n_train+1:end]

    base = get_augmented_indep(split, alphas, β)
    features = isnothing(extra_features) ? base : hcat(base, extra_features)
    X = convert.(Float32, features)

    dtrain = XGBoost.DMatrix(X[train_idx, :], label = Y[train_idx])
    dholdout = XGBoost.DMatrix(X[holdout_idx, :], label = Y[holdout_idx])
    return dtrain, dholdout
end;

In [23]:
# Target for the XGBoost model on `split`: the residual left after subtracting
# the linear blend get_indep(split, alphas) * β from the dependent variable,
# converted element-wise to Float32.
function get_xgboost_dep(split)
    residual = get_dep(split) - get_indep(split, alphas) * β
    return convert.(Float32, residual)
end

# Train an XGBoost squared-error regressor on `split` with labels `Y`.
#
# Arguments:
#   split               - name of the data split passed to get_xgboost_split.
#   Y                   - label vector for the rows of `split`.
#   extra_features      - optional extra feature columns appended to the
#                         augmented features (default: nothing).
# Keywords:
#   num_rounds          - number of boosting rounds (default: 100).
#   training_split_perc - fraction of rows used for training; the rest is held
#                         out and reported in the watchlist (default: 1.0,
#                         i.e. train on everything).
#
# Returns the booster produced by XGBoost.xgboost.
# Throws ArgumentError if training_split_perc is outside [0, 1].
function train_model(
    split,
    Y,
    extra_features = nothing;
    num_rounds = 100,
    training_split_perc = 1.0,
)
    # TODO early stopping
    0 <= training_split_perc <= 1 || throw(
        ArgumentError(
            "training_split_perc must be in [0, 1], got $training_split_perc",
        ),
    )
    dtrain, dtest = get_xgboost_split(split, training_split_perc, Y, extra_features)
    # With nothing held out the second matrix has no rows, so only the
    # training set is watched.
    watchlist =
        training_split_perc == 1 ? [(dtrain, "train")] :
        [(dtrain, "train"), (dtest, "test")]
    return XGBoost.xgboost(
        dtrain,
        num_rounds,
        watchlist = watchlist,
        objective = "reg:squarederror",
        nthread = Threads.nthreads(),
    )
end;

In [11]:
# Score the full model (linear blend + XGBoost residual booster `bst`) on the
# test split. The combined prediction is clamped to [1, 10] before scoring.
# Returns (rmse, mae, mean absolute value of the raw booster predictions).
function evaluate(bst)
    X_test = convert.(Float32, get_augmented_indep("test", alphas, β))
    Y_test = convert.(Float32, get_dep("test"))
    residual_preds = XGBoost.predict(bst, X_test)
    blended = clamp.(get_indep("test", alphas) * β + residual_preds, 1, 10)
    return rmse(Y_test, blended), mae(Y_test, blended), mean(abs.(residual_preds))
end;

In [12]:
# Fit the residual booster on the validation split: the labels are the
# residuals of the linear blend (see get_xgboost_dep).
bst = train_model("validation", get_xgboost_dep("validation"));

[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:02[39m
[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:01[39m
[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:01[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:00 ( 0.23 μs/it)[39mit)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:01 ( 0.40 μs/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:03 ( 0.84 μs/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:03 ( 0.84 μs/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:03 ( 0.80 μs/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:03 ( 0.80 μs/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:00 ( 1.31 μs/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:00 ( 1.14 μs/it)[39m
[38;5;4m[1m[ [22m[39m[38;5;4m[1mDebug: [22m[39m20220116 20:08:14 nsfw categories: 

In [13]:
#[100]	train-rmse:1.074687
#(1.0776735836062472, 0.7779856850061714, 0.11236783f0)

# [300]	train-rmse:1.066483
# 1.0753573535488306

In [14]:
# evaluate(bst) returns three values, so the label names all three: the third
# is the mean absolute value of the booster's own predictions.
@debug "XGBoost model test (rmse, mae, mean abs xgboost pred) = $(evaluate(bst))"

[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:01[39m
[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:01[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:01 ( 0.45 μs/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:01 ( 0.41 μs/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:03 ( 0.84 μs/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:03 ( 0.85 μs/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:03 ( 0.82 μs/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:03 ( 0.81 μs/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:00 ( 1.14 μs/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:00 ( 1.13 μs/it)[39m
[38;5;4m[1m[ [22m[39m[38;5;4m[1mDebug: [22m[39m20220116 20:21:30 nsfw categories: String7["white"]
[32mProgress: 100%|███████████████████████████| Time: 0:00:01 ( 1.88 μs/

In [15]:
# Path where the residual booster is persisted; also recorded in the saved params.
xgboost_model_fn = "../../data/alphas/$name/xgb.model"
# Make sure the target directory exists so the save does not fail on a fresh
# checkout; mkpath is a no-op when the directory is already there.
mkpath(dirname(xgboost_model_fn))
XGBoost.save(bst, xgboost_model_fn)

## Train a model to predict confidence intervals

In [26]:
# Compute the per-row squared error of the full model (booster `bst` plus the
# linear blend) on `split`.
# Returns a tuple: a RatingsDataset built from the split's user and item
# columns with the squared errors in the rating slot, and the raw booster
# predictions.
function get_error(bst, split)
    X = convert.(Float32, get_augmented_indep(split, alphas, β))
    Y = convert.(Float32, get_dep(split))
    bst_preds = XGBoost.predict(bst, X)
    preds = bst_preds + get_indep(split, alphas) * β
    squared_errors = abs2.(Y - preds)
    df = get_split(split)
    return RatingsDataset(df.user, df.item, squared_errors), bst_preds
end;

In [27]:
# Squared errors of the full model on the test split, plus the booster's raw
# predictions there (reused below as an extra feature for the error model).
errors, test_preds = get_error(bst, "test");

[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:01[39m


In [28]:
# Train a second booster to predict the squared error: labels are
# errors.rating (the squared errors from get_error) and the residual booster's
# predictions are appended as an extra feature column.
# NOTE(review): this model is fit on the "test" split itself — confirm intended.
error_bst = train_model("test", errors.rating, test_preds);

[1]	train-rmse:2.760026
[2]	train-rmse:2.724047
[3]	train-rmse:2.704180
[4]	train-rmse:2.692170
[5]	train-rmse:2.684823
[6]	train-rmse:2.679880
[7]	train-rmse:2.675370
[8]	train-rmse:2.671283
[9]	train-rmse:2.668919
[10]	train-rmse:2.666616
[11]	train-rmse:2.664979
[12]	train-rmse:2.663281
[13]	train-rmse:2.661551
[14]	train-rmse:2.660488
[15]	train-rmse:2.659533
[16]	train-rmse:2.658401
[17]	train-rmse:2.657456
[18]	train-rmse:2.656397
[19]	train-rmse:2.655383
[20]	train-rmse:2.654523
[21]	train-rmse:2.653666
[22]	train-rmse:2.652954
[23]	train-rmse:2.652259
[24]	train-rmse:2.651504
[25]	train-rmse:2.650948
[26]	train-rmse:2.650256
[27]	train-rmse:2.649628
[28]	train-rmse:2.649040
[29]	train-rmse:2.647902
[30]	train-rmse:2.646841
[31]	train-rmse:2.645940
[32]	train-rmse:2.645429
[33]	train-rmse:2.644907
[34]	train-rmse:2.644366
[35]	train-rmse:2.643850
[36]	train-rmse:2.643141
[37]	train-rmse:2.642603
[38]	train-rmse:2.642021
[39]	train-rmse:2.641535
[40]	train-rmse:2.640980
[41]	trai

In [29]:
# Path where the squared-error booster is persisted; also recorded in the saved params.
xgboost_error_model_fn = "../../data/alphas/$name/xgb_error.model"
# Make sure the target directory exists so the save does not fail on a fresh
# checkout; mkpath is a no-op when the directory is already there.
mkpath(dirname(xgboost_error_model_fn))
XGBoost.save(error_bst, xgboost_error_model_fn)

In [30]:
# 2.631241

## Save params

In [31]:
# Persist everything needed to reproduce the blend: the linear weights, the
# ordered list of signal names they apply to, and the paths of the two saved
# XGBoost models (residual model and squared-error model).
# NOTE(review): write_params is defined outside this notebook; where and in
# what format it writes is not visible here.
write_params(
    Dict(
        "β" => β,
        "alphas" => alphas,
        "bst" => xgboost_model_fn,
        "error_bst" => xgboost_error_model_fn,
    ),
);