# Generalized Neural Network
* A denoising autoencoder that learns the user's ratings and implicit ratings

In [None]:
using Random
import BSON

In [None]:
using NBInclude
@nbinclude("Alpha.ipynb");

In [None]:
# Transfer function applied to the model and to every batch via `x |> device`.
# NOTE(review): `gpu` is not defined in this notebook section — presumably Flux's
# GPU mover brought into scope by Alpha.ipynb; confirm.
device = gpu;

In [None]:
# Seed the global RNG with a fixed constant multiplied by the hash of `name`.
# NOTE(review): `name` is defined outside this section — presumably the name of
# this alpha (it is also used in the model output paths); confirm.
Random.seed!(20220313 * hash(name));

## Data preparation

In [None]:
# Explicit-rating splits, loaded through `get_residuals` with `residual_alphas`
# (presumably ratings with the listed alphas' predictions subtracted — confirm).
const training = get_residuals("training", residual_alphas)
const validation = get_residuals("validation", residual_alphas)

# Implicit-feedback split.
const implicit_training = get_split("implicit_training")

# Each dimension gets one extra trailing slot so that unseen items / unseen users
# have an index to be mapped onto.
const n_items = 1 + num_items()
const n_users = 1 + maximum(training.user);

In [None]:
# Column accesses are faster than row accesses, so both matrices are laid out as
# (item, user): one user's full history is a single column `M[:, u]`.

# Build the n_items × n_users sparse matrix holding the ratings of one split.
_item_user_matrix(split) =
    sparse(split.item, split.user, split.rating, n_items, n_users)

const R = _item_user_matrix(training)
const Ri = _item_user_matrix(implicit_training);

In [None]:
# Dropout probability passed to the `Dropout` layer at the input of the model.
const dropout_perc = 0.5

In [None]:
"""
    get_rating_sum(split)

Return an `n_users × 1` `Float32` matrix whose `u`-th entry is the sum of
`split.rating[i]` over every entry `i` with `split.user[i] == u`.

`split` must expose equally long `user` and `rating` vectors, and every user id
must lie in `1:n_users`.

The accumulation is a single serial pass. An earlier version kept one
accumulator column per thread and indexed it with `Threads.threadid()`; that
pattern is not safe with the dynamic `@threads` scheduler (tasks are not pinned
to a thread, and thread ids may exceed `Threads.nthreads()` when interactive
threads are configured). The work per element is one add, so a plain loop is
both simpler and robust.
"""
function get_rating_sum(split)
    totals = zeros(Float32, n_users)
    for i in eachindex(split.rating)
        totals[split.user[i]] += split.rating[i]
    end
    # keep the historical column-matrix shape so existing callers are unaffected
    return reshape(totals, n_users, 1)
end;

In [None]:
# Per-user aggregates consumed by the heterogeneous input features.

# Sum of each user's implicit ratings.
const implicit_counts = get_rating_sum(implicit_training)

# Number of explicit training ratings per user: every rating is replaced by a
# constant one before summing.
const rating_counts = let unit_ratings = ones(eltype(training.rating), length(training.rating))
    get_rating_sum(RatingsDataset(training.user, training.item, unit_ratings))
end

# Sum of each user's ratings, read from `get_split("training")` rather than from
# the residual `training` dataset.
const rating_sum = get_rating_sum(get_split("training"))

In [None]:
"""
    get_data(split, j, train)

Build one `(X, Y)` sample from the `j`-th entry of `split`.

`X` concatenates, for the entry's user: the column of `R` (ratings, unseen items
are zero), the column of `Ri` (implicit ratings) and nine heterogeneous features
(three statistics, their squares and their square roots). The entry's own item
is zeroed in both rating columns, i.e. it is held out of the input.

`Y` has one slot per item:
- `train == true`: every item with a non-zero implicit rating gets either `1`
  (when `train_implicit_model`) or the user's rating from `R`;
- `train == false`: only the held-out item is filled, with `1` (when
  `train_implicit_model`) or `split.rating[j]`.

User and item ids beyond the known range are mapped to the reserved last slot.

NOTE(review): the average-rating feature subtracts `R[i, u]` (from the residual
dataset) from `rating_sum[u]` (from `get_split("training")`); confirm that both
are on the same scale.
"""
function get_data(split, j, train)
    # ids outside the training range collapse onto the reserved last index
    u = min(split.user[j], n_users)
    i = min(split.item[j], n_items)
    r_iu = R[i, u]
    held_out = r_iu != 0

    # the user's rating and implicit-rating columns, with the sampled item blanked
    ratings = collect(R[:, u])
    ratings[i] = 0
    implicit = collect(Ri[:, u])
    implicit[i] = 0

    # heterogeneous features; the held-out item is excluded from each statistic
    n_seen = max(implicit_counts[u] - held_out, 0)
    n_rated = max(rating_counts[u] - held_out, 0)
    features = [
        n_seen / n_items, # fraction of items seen
        n_rated / n_items, # fraction of items rated
        (rating_sum[u] - r_iu) / max(rating_counts[u] - held_out, 1) / 10, # average rating
    ]
    features = Float32.(features)
    extras = vcat(features, features .^ 2, sqrt.(features))

    X = vcat(ratings, implicit, extras)

    # targets
    Y = zero(ratings)
    if train
        seen = findall(!iszero, implicit)
        if train_implicit_model
            fill!(view(Y, seen), 1)
        else
            Y[seen] = ratings[seen]
        end
    else
        Y[i] = train_implicit_model ? 1 : split.rating[j]
    end

    return (X, Y)
end

"""
    get_batch(split, block_size, train)

Draw `block_size` entries of `split` uniformly at random (with replacement),
build their `(X, Y)` samples with `get_data(split, j, train)` in parallel, and
return a one-element vector `[(X, Y)]` moved to `device`, where `X` and `Y` are
the samples batched with `Flux.batch`.

Each sample is written to its own preallocated slot, so column `k` of the batch
always corresponds to the `k`-th drawn index. Collecting into per-thread buffers
indexed by `Threads.threadid()` is avoided: it is not safe under the dynamic
`@threads` scheduler and makes the column order depend on scheduling.
"""
function get_batch(split, block_size, train)
    idxs = rand(1:length(split.rating), block_size)
    samples = Vector{Any}(undef, length(idxs))
    Threads.@threads for k in eachindex(idxs)
        samples[k] = get_data(split, idxs[k], train)
    end
    # the comprehensions narrow the element type from `Any` to the sample type
    X = Flux.batch([s[1] for s in samples])
    Y = Flux.batch([s[2] for s in samples])
    return [(X, Y)] |> device
end;

In [None]:
"""
    generate_model()

Construct the autoencoder and move it to `device`.

The input has `2 * n_items + 9` entries: the user's ratings for every item
(unseen items are zero), the implicit ratings for every item, and nine
heterogeneous features. The output has one entry per item; depending on
`train_implicit_model` it is trained with cross-entropy (implicit ratings) or
with mean squared error on observed items (ratings). A `Dropout(dropout_perc)`
layer is applied to the input.
"""
function generate_model()
    n_inputs = 2 * n_items + 9
    encoder = Chain(
        Dense(n_inputs, 512, relu),
        Dense(512, 256, relu),
        Dense(256, 128, relu),
    )
    decoder = Chain(
        Dense(128, 256, relu),
        Dense(256, 512, relu),
        Dense(512, n_items),
    )
    return device(Chain(Dropout(dropout_perc), encoder, decoder))
end;

## Training

In [None]:
"""
    rating_loss(ŷ, y)

Mean squared error between `ŷ` and `y`, restricted to the positions where the
target `y` is non-zero (zero targets are treated as "no rating").
"""
function rating_loss(ŷ, y)
    observed = y .!= 0
    predicted, target = ŷ[observed], y[observed]
    return Flux.mse(predicted, target)
end

"""
    implicit_loss(ŷ, y)

Cross-entropy between the logits `ŷ` and the targets `y`, computed with
`Flux.logitcrossentropy`.
"""
function implicit_loss(ŷ, y)
    return Flux.logitcrossentropy(ŷ, y)
end

"""
    loss_components(m, x, y)

Run the model `m` on `x` and score the output against `y` with `implicit_loss`
when `train_implicit_model` is set, otherwise with `rating_loss`.
"""
function loss_components(m, x, y)
    ŷ = m(x)
    return train_implicit_model ? implicit_loss(ŷ, y) : rating_loss(ŷ, y)
end;

In [None]:
"""
    reset_training()

Reset the global early-stopping state to its initial values before a new
training run.
"""
function reset_training()
    global best_loss, smoothing_loss, iters_without_improvement
    global patience, smoothing, min_improvement
    global continue_training, iters

    # hyperparameters of the stopping rule
    patience = 30          # evaluations without improvement tolerated before stopping
    smoothing = 0.9        # weight of the previous value in the smoothed validation loss
    min_improvement = 1e-4 # margin by which the smoothed loss must beat the best loss

    # running state
    best_loss = Inf
    smoothing_loss = Inf
    iters_without_improvement = 0
    continue_training = true
    iters = 0
end;

In [None]:
"""
    train_model(model_name, seed)

Train one model with early stopping and return a `Dict` describing the run.

The global RNG is seeded with `seed`, a fresh model is built with
`generate_model()` and the global early-stopping state is reset. Training
repeatedly draws a batch of 128 training samples and takes one optimiser step.
A throttled callback (`Flux.throttle(evalcb, 600)`) evaluates the model in test
mode, smooths the normalised validation loss exponentially and

- saves the model to `../../data/alphas/\$name/model.\$(model_name).bson` whenever
  the smoothed loss improves on the best value by more than `min_improvement`;
- stops training after `patience` consecutive evaluations without improvement.

All losses are divided by baseline losses measured on the untrained model.

The returned `Dict` has the keys `"name"`, `"loss"` (best smoothed validation
loss), `"patience"`, `"iters"`, `"model"` (checkpoint path),
`"residual_alphas"` and `"seed"`.
"""
function train_model(model_name, seed)
    Random.seed!(seed)
    m = generate_model()
    ps = Flux.params(m)
    reset_training()
    BLAS.set_num_threads(Threads.nthreads())

    # One definition of the checkpoint location, shared by the save call and the
    # returned metadata. The directory is created up front so the first
    # checkpoint cannot fail on a missing folder.
    model_path = "../../data/alphas/$name/model.$(model_name).bson"
    mkpath(dirname(model_path))

    # Setup early stopping callbacks

    # Average loss over `epochs` freshly sampled batches of 128 entries of `split`.
    function evalcb(split, train, epochs)
        losses = []
        @showprogress for epoch = 1:epochs
            push!(losses, loss_components(m, get_batch(split, 128, train)[1]...))
        end
        reduce(.+, losses) ./ length(losses)
    end

    function evalcb()
        # print losses and perform early stopping
        testmode!(m)
        @debug "iteration: $iters"
        training_losses = evalcb(training, true, 100)
        training_loss = sum(training_losses ./ training_baseline_loss)
        @debug "training losses: $(training_losses) -> $(training_loss)"
        inference_losses = evalcb(validation, false, 500)
        inference_loss = sum(inference_losses ./ inference_baseline_loss)
        # exponential smoothing; the first evaluation initialises the average
        if smoothing_loss == Inf
            global smoothing_loss = inference_loss
        else
            global smoothing_loss =
                smoothing * smoothing_loss + (1 - smoothing) * inference_loss
        end
        inference_loss = smoothing_loss
        @debug "validation losses: $(inference_losses) -> $(inference_loss)"
        if inference_loss + min_improvement < best_loss
            global best_loss = inference_loss
            global iters_without_improvement = 0
            BSON.@save model_path m
        else
            global iters_without_improvement += 1
            if iters_without_improvement >= patience
                global continue_training = false
            end
        end
        trainmode!(m)
    end

    # Setup loss
    training_baseline_loss = evalcb(training, true, 1000)
    inference_baseline_loss = evalcb(validation, false, 1000)
    throttled_cb = Flux.throttle(evalcb, 600)
    opt = ADAM(0.001, (0.9, 0.999), 1e-5)

    function loss(x, y)
        loss_components(m, x, y) / training_baseline_loss
    end

    # Train model
    while continue_training
        batch = get_batch(training, 128, true)
        Flux.train!(loss, ps, batch, opt, cb = throttled_cb)
        global iters += 1
    end

    return Dict(
        "name" => "$name.$model_name",
        "loss" => best_loss,
        "patience" => patience,
        "iters" => iters,
        "model" => model_path,
        "residual_alphas" => residual_alphas,
        "seed" => seed,
    )
end;

## Write predictions

In [None]:
"""
    get_data(u)

Build the full model input for user `u` at prediction time: the user's column
of `R` (ratings), the user's column of `Ri` (implicit ratings) and nine
heterogeneous features (fraction of items seen, fraction of items rated and
average rating, followed by their squares and their square roots). Nothing is
held out.

A user id beyond the known range is mapped to the reserved last slot
`n_users`, matching what the training-time `get_data(split, j, train)` does,
instead of raising a `BoundsError`.
"""
function get_data(u)
    # handle users that aren't in the training set
    u = min(u, n_users)

    # ratings
    X1 = collect(R[:, u])
    # implicit ratings
    X2 = collect(Ri[:, u])
    # heterogeneous features
    features = [
        implicit_counts[u] / n_items, # fraction of items seen
        rating_counts[u] / n_items,  # fraction of items rated
        rating_sum[u] / max(rating_counts[u], 1) / 10, # average rating
    ]
    features = convert.(Float32, features)
    X3 = vcat(features, features .^ 2, sqrt.(features))
    return vcat(X1, X2, X3)
end

"""
    get_batch(users)

Build the model input for every user in `users` (in parallel) and return them
batched with `Flux.batch` and moved to `device`.

Column `k` of the result is always the input of `users[k]`: each input is
written to its own preallocated slot. Callers rely on this alignment to look up
a user's predictions by position. Gathering the inputs from per-thread buffers
indexed by `Threads.threadid()` does not guarantee it, because the dynamic
`@threads` scheduler may run any chunk of iterations on any thread.
"""
function get_batch(users)
    columns = Vector{Any}(undef, length(users))
    Threads.@threads for k in eachindex(columns)
        columns[k] = get_data(users[k])
    end
    # the comprehension narrows the element type from `Any` to the column type
    X = Flux.batch([c for c in columns])
    return X |> device
end;

In [None]:
"""
    gmodel(m, users, items)

Return a `Float32` vector whose `j`-th entry is the prediction of model `m` for
the pair `(users[j], items[j])`.

Each distinct user is run through the model once, in mini-batches of 128 users.
When `train_implicit_model` is set, the model outputs are turned into
probabilities with a softmax over items before the requested entries are read.
"""
function gmodel(m, users, items)
    # index users: for every distinct user, the output positions that belong to it.
    # Built in a single serial pass; per-thread dictionaries selected through
    # `Threads.threadid()` are not safe under the dynamic `@threads` scheduler.
    user_to_output_idxs = Dict{eltype(users),Vector{Int}}()
    for j in eachindex(users)
        idxs = get!(user_to_output_idxs, users[j]) do
            Int[]
        end
        push!(idxs, j)
    end

    # allocate outputs
    ratings = zeros(Float32, length(users))

    # split users into mini-batches
    deduped_users = collect(keys(user_to_output_idxs))
    batches = collect(Iterators.partition(deduped_users, 128))

    # compute predictions
    @tprogress Threads.@threads for i = 1:length(batches)
        b = batches[i]
        alpha = m(get_batch(b)) |> cpu
        if train_implicit_model
            # softmax over items; subtracting the column maximum first keeps
            # `exp` from overflowing and does not change the result
            alpha .= exp.(alpha .- maximum(alpha, dims = 1))
            alpha ./= sum(alpha, dims = 1)
        end

        # column `input_idx` of `alpha` holds the outputs for user `b[input_idx]`;
        # every output position is written by exactly one batch
        for (input_idx, u) in enumerate(b)
            for output_idx in user_to_output_idxs[u]
                ratings[output_idx] = alpha[items[output_idx], input_idx]
            end
        end
    end
    return ratings
end;

In [None]:
"""
    make_prediction(sparse_preds, users, items)

Look up `sparse_preds[users[j], items[j]]` for every `j` and return the values
as a `Float64` vector with one entry per element of `users`.
"""
function make_prediction(sparse_preds, users, items)
    n = length(users)
    preds = fill(0.0, n)
    @tprogress Threads.@threads for j = 1:n
        preds[j] = sparse_preds[users[j], items[j]]
    end
    return preds
end;

In [None]:
"""
    save_model(params)

Load the model stored at `params["model"]`, predict every (user, item) pair of
the training, validation and test splits, and write both `params` and the
predictions to the output directory `params["name"]`.
"""
function save_model(params)
    BSON.@load params["model"] m
    testmode!(m)
    BLAS.set_num_threads(1) # gmodel already multithreads

    # predict every pair that appears in any split
    splits = [training, validation, get_residuals("test", residual_alphas)]
    full_df = reduce(cat, splits)
    ratings = gmodel(m, full_df.user, full_df.item)
    sparse_preds = sparse(full_df.user, full_df.item, ratings)

    # predictions are served from the sparse lookup table
    outdir = params["name"]
    predict = (users, items) -> make_prediction(sparse_preds, users, items)

    write_params(params, outdir = outdir)
    write_predictions(
        predict,
        residual_alphas = residual_alphas,
        outdir = outdir,
        implicit = train_implicit_model,
    )
end;

In [None]:
"""
    fit(num_seeds, start=1)

Train and save one model per seed. `num_seeds` seeds are drawn from the global
RNG up front; models `start` through `num_seeds` are then trained with
`train_model(i, seeds[i])` and written out with `save_model`.
"""
function fit(num_seeds, start = 1)
    seeds = map(hash, rand(Int, num_seeds))
    foreach(start:lastindex(seeds)) do i
        save_model(train_model(i, seeds[i]))
    end
end