# Pretraining
* Trains a bag-of-words model on user data

In [None]:
# Notebook parameters. Both are empty placeholders in this cell; they are
# interpolated into the output name and passed to the loaders further down, so
# real values have to be supplied before the remaining cells are run
# (presumably injected by whatever executes the notebook — TODO confirm).
metric = ""
medium = "";

In [None]:
import NBInclude: @nbinclude
@nbinclude("../Alpha.ipynb");

In [None]:
import HDF5
import JSON
import SparseArrays: AbstractSparseArray, findnz, sparse

In [None]:
# Identifier of this alpha. It doubles as the logging directory and as the
# sub-path under "alphas" used by the export, config and prediction cells.
name = join((medium, "BagOfWords", "v1", metric), "/")
set_logging_outdir(name);

# Data

In [None]:
# Regress the "rating" metric on the single alpha `alpha` using the validation
# features and return whatever `regress` produces. Wrapped in `@memoize`, so
# repeated calls with the same arguments are presumably served from a cache.
@memoize function get_rating_beta(medium, alpha)
    features = get_features("validation", "rating", medium, [alpha])
    return regress(features..., "rating")
end;

In [None]:
# Sparse matrix of training-split rating residuals for `medium`: the metric
# column minus the baseline alpha scaled by the fitted coefficient.
function rating_inputs(medium::String)
    baseline = "$medium/rating/Baseline"
    coef = get_rating_beta(medium, baseline)
    df = get_split("training", "rating", medium, [:userid, :itemid, :metric], baseline)
    # Compute the residual first, then overwrite the metric column in place.
    residual = df.metric - df.alpha .* coef
    df.metric .= residual
    return sparse(df, medium)
end;

# Sparse matrix built from the training split of the "watch" metric for `medium`.
function watch_inputs(medium::String)
    columns = [:userid, :itemid, :metric]
    return sparse(get_split("training", "watch", medium, columns), medium)
end;

# Stack the model inputs vertically: first the rating matrices of every medium
# in ALL_MEDIUMS order, then the watch matrices in the same order.
function get_epoch_inputs()
    @info "loading inputs"
    rating_blocks = map(rating_inputs, ALL_MEDIUMS)
    watch_blocks = map(watch_inputs, ALL_MEDIUMS)
    return vcat(rating_blocks..., watch_blocks...)
end;

In [None]:
# Sparse label matrix for one (split, metric, medium) combination.
# For the "rating" metric the labels are residuals against the baseline alpha;
# every other metric is used as stored.
function get_epoch_labels(split, metric, medium)
    @info "loading labels"
    columns = [:userid, :itemid, :metric]
    if metric != "rating"
        return sparse(get_split(split, metric, medium, columns), medium)
    end
    baseline = "$medium/rating/Baseline"
    df = get_split(split, metric, medium, columns, baseline)
    coef = get_rating_beta(medium, baseline)
    df.metric .= df.metric - df.alpha .* coef
    return sparse(df, medium)
end;

In [None]:
# Build the sparse matrix of per-entry loss weights for one split.
#
# Arguments
#   split, metric, medium : forwarded to `get_split` to select the rows.
#   λ_wu, λ_wa            : exponents handed to `powerdecay` together with the
#                           `get_counts` of the :userid and :itemid columns.
#   λ_wt                  : base of the temporal decay.
#
# For the "training" split each row's weight is
#   λ_wt^((1 - updated_at) / days_in_timestamp_units(365))
# multiplied by `powerdecay(count, λ)` for the user and the item column.
# For every other split the weight is `powerdecay(get_counts(userid), -1.0f0)`.
# The weights are stored in the `metric` field and converted with `sparse`.
function get_epoch_weights(
    split::String,
    metric::String,
    medium::String,
    λ_wu::Real,
    λ_wa::Real,
    λ_wt::Real,
)
    @info "loading weights"
    GC.gc()
    if split == "training"
        df = get_split(split, metric, medium, [:userid, :itemid, :updated_at])
        # The array returned by `df.updated_at` is reused as the output buffer:
        # each timestamp is read and then overwritten by its weight.
        weights = df.updated_at
        # The conversion does not depend on the row, so evaluate it once
        # instead of once per element.
        year = days_in_timestamp_units(365)
        @showprogress for i in eachindex(weights)
            weights[i] = λ_wt^((1 - weights[i]) / year)
        end
        for (c, λ) in zip([:userid, :itemid], [λ_wu, λ_wa])
            # `w` is indexed per row below, so it presumably holds the count
            # that belongs to each row's user/item — TODO confirm.
            w = get_counts(getfield(df, c))
            @showprogress for i in eachindex(weights)
                weights[i] *= powerdecay(w[i], λ)
            end
        end
    else
        df = get_split(split, metric, medium, [:userid, :itemid])
        weights = powerdecay(get_counts(df.userid), -1.0f0)
    end
    df = @set df.metric = weights
    GC.gc()
    sparse(df, medium)
end;

# Disk I/O

In [None]:
# Assemble the configuration dictionary for one (medium, metric) run.
# "output_size_index" is the position of `medium` within ALL_MEDIUMS as returned
# by `findfirst` (so it is `nothing` when the medium is not listed).
function create_training_config(medium, metric)
    input_sizes = num_items.(ALL_MEDIUMS)
    output_size_index = findfirst(==(medium), ALL_MEDIUMS)
    return Dict(
        # model
        "input_sizes" => input_sizes,
        "output_size_index" => output_size_index,
        "metric" => metric,
        # training
        "user_weight_decay" => 0.0f0,
        "item_weight_decay" => 0.0f0,
        "temporal_weight_decay" => 0.5f0,
        "mask_rate" => 0.25,
        # data
        "num_shards" => 8,
    )
end;

In [None]:
# Make sure `outdir` exists and delete every regular file directly inside it.
# Sub-directories and their contents are left alone. `config` is accepted but
# not used by this function.
function setup_training(config, outdir)
    isdir(outdir) || mkpath(outdir)
    # The filter is lazy, so each entry is checked right before it is removed.
    for path in Iterators.filter(isfile, readdir(outdir, join = true))
        rm(path)
    end
end;

In [None]:
# Write one HDF5 file containing the three sparse matrices (as the flattened
# fields produced by `record_sparse_array!`) plus the epoch size and the two
# user lists. Every entry of the collected dictionary becomes a top-level
# dataset named after its key.
function save_features(X, Y, W, epoch_size, users, valid_users, filename)
    payload = Dict{String,Any}()
    for (label, matrix) in zip(("inputs", "labels", "weights"), (X, Y, W))
        record_sparse_array!(payload, label, matrix)
    end
    payload["epoch_size"] = epoch_size
    payload["users"] = users
    payload["valid_users"] = valid_users
    HDF5.h5open(filename, "w") do file
        for (key, value) in payload
            file[key] = value
        end
    end
end

# Flatten the sparse array `x` into `d` under four keys derived from `name`:
#   name_i, name_j : row and column indices of the stored entries
#   name_v         : the stored values
#   name_size      : [number of rows of x, num_users()]
#
# `findnz` is imported by name: the selective `import SparseArrays: ...` in this
# file does not bind the module name itself, so a qualified
# `SparseArrays.findnz` only resolves if another included file happens to bring
# the module into scope.
#
# NOTE(review): the second entry of name_size is the global `num_users()` and
# not `size(x, 2)`, while callers in this file pass column slices of the full
# matrices — confirm that the reader of these files expects that.
function record_sparse_array!(d::Dict, name::String, x::AbstractSparseArray)
    rows, cols, vals = findnz(x)
    d[name*"_i"] = rows
    d[name*"_j"] = cols
    d[name*"_v"] = vals
    d[name*"_size"] = [size(x, 1), num_users()]
end;

# Run

In [None]:
# Export the data of one split below "alphas/<name>/<split>", after emptying
# that directory of files. Reads the notebook globals `name`, `metric` and
# `medium`.
#
#   split  : "inference" takes the first branch; any other value is treated as
#            a labelled split.
#   X      : input matrix shared by all splits (columns are sliced per shard).
#   config : read for the weight decays and "num_shards"; for labelled splits
#            the key "epoch_size_<split>" is added to it.
#
# "inference": a single file data.1.h5 with the full `X`, placeholder label and
# weight matrices, and as valid users the sorted union of the user ids found in
# the raw "test" and "negative" splits.
# Other splits: labels and weights are loaded, valid users are the users whose
# weight column sums to more than zero, and the columns are written in
# "num_shards" contiguous chunks data.<i>.h5. Every shard carries the full
# `valid_users` list and the global epoch size.
function save_split!(split, X, config)
    @info "loading $split data"
    GC.gc()
    outdir = get_data_path(joinpath("alphas", name, split))
    setup_training(config, outdir)
    if split == "inference"
        Y = sparse(RatingsDataset(), medium) # unused
        W = sparse(RatingsDataset(), medium) # unused
        users = collect(0:num_users()-1)
        valid_users = Set{Int32}()
        # The raw split is requested without a metric argument, so it is loaded
        # once per split name rather than once per entry of ALL_METRICS.
        for s in ["test", "negative"]
            df = get_raw_split(s, medium, [:userid], nothing)
            valid_users = union(valid_users, Set(df.userid))
        end
        valid_users = sort(collect(valid_users))
        save_features(X, Y, W, length(valid_users), users, valid_users, "$outdir/data.1.h5")
    else
        Y = get_epoch_labels(split, metric, medium)
        W = get_epoch_weights(
            split,
            metric,
            medium,
            config["user_weight_decay"],
            config["item_weight_decay"],
            config["temporal_weight_decay"],
        )
        num_shards = config["num_shards"]
        users = collect(0:num_users()-1)
        # Column sums of W select the users that have at least one weighted entry.
        valid_users = users[vec(sum(W, dims = 1) .> 0)]
        epoch_size = length(valid_users)
        config["epoch_size_$(split)"] = epoch_size
        # Contiguous column ranges of ceil(num_users / num_shards) users each.
        chunks = collect(
            Iterators.partition(1:num_users(), div(num_users(), num_shards, RoundUp)),
        )
        @showprogress for i in eachindex(chunks)
            save_features(
                X[:, chunks[i]],
                Y[:, chunks[i]],
                W[:, chunks[i]],
                epoch_size,
                users[chunks[i]],
                valid_users,
                "$outdir/data.$i.h5",
            )
        end
    end
end

# Load the shared inputs once, export all four splits, then write the
# configuration as JSON. The config is written last because exporting the
# labelled splits adds entries to it.
function save_splits(config)
    X = get_epoch_inputs()
    for stage in ("training", "validation", "test", "inference")
        save_split!(stage, X, config)
    end

    config_path = get_data_path(joinpath("alphas", name, "config.json"))
    open(config_path, "w") do io
        write(io, JSON.json(config))
    end
end;

In [None]:
# Run configuration for the selected medium/metric. save_split! later adds the
# "epoch_size_<split>" entries to this same dictionary.
const config = create_training_config(medium, metric);

In [None]:
# Export the data files for every split and write config.json.
save_splits(config);

In [None]:
# Force a collection; the matrices built during the export were locals of
# save_splits / save_split! and are no longer referenced at this point.
GC.gc()

In [None]:
# Log the configuration, including the epoch sizes recorded during the export.
@info config

In [None]:
# Invoke Pytorch.py once per mode, in this order, pointing it at this alpha's
# output directory. `run` throws if a process exits unsuccessfully, which
# stops the remaining modes.
for stage in ("pretrain", "finetune", "inference")
    run(`python3 Pytorch.py --outdir $name --mode $stage`)
end

In [None]:
# Delete the per-split export directories of this alpha, including contents.
for stage in ("training", "validation", "test", "inference")
    rm(get_data_path(joinpath("alphas", name, stage)), recursive = true)
end

# Save

In [None]:
# Read the "predictions" and "users" datasets from this alpha's predictions.h5
# (presumably produced by the Pytorch.py runs — TODO confirm). The do-block
# form closes the file even when one of the reads throws, which the manual
# open/close pair did not.
predictions, users =
    HDF5.h5open(get_data_path(joinpath("alphas", name, "predictions.h5")), "r") do file
        read(file["predictions"]), read(file["users"])
    end;

In [None]:
# Map every user id in `users` to its position in that array. The key and
# value types are fixed up front instead of using an untyped Dict{Any,Any}.
# If an id occurs more than once, the last position wins.
user_to_index = Dict{eltype(users),Int}(u => i for (i, u) in enumerate(users));

In [None]:
# Look up one prediction per (user, item) pair.
#   users, items  : parallel collections; `items[i] + 1` is used as the row, so
#                   item ids are treated as zero-based.
#   predictions   : matrix indexed as [item row, user column].
#   user_to_index : maps a user id to its column in `predictions`.
# Returns a Float32 vector with one score per pair; asserts that every user id
# is present in `user_to_index`.
function model(users, items, predictions, user_to_index)
    scores = zeros(Float32, length(users))
    @showprogress for i in eachindex(scores)
        @assert users[i] in keys(user_to_index)
        column = user_to_index[users[i]]
        row = items[i] + 1
        scores[i] = predictions[row, column]
    end
    return scores
end;

In [None]:
# Hand the lookup closure to write_alpha for the "test" and "negative" splits.
# The closure captures the `predictions` and `user_to_index` globals.
write_alpha(
    (users, items) -> model(users, items, predictions, user_to_index),
    medium,
    name,
    ["test", "negative"],
)

In [None]:
# Report the test loss of this alpha. For the "rating" metric the loss is
# computed on the baseline alpha together with this one; otherwise on this
# alpha alone.
for split in ["test"]
    alphas = metric == "rating" ? ["$medium/rating/Baseline", name] : [name]
    val = compute_loss(metric, medium, alphas, split)
    @info "$split loss = $val"
end