# Generalized Neural Network
* A denoising autoencoder that learns the user's ratings and implicit ratings

In [1]:
# TODO deduplicate code

In [2]:
# Name of this alpha; it is interpolated into the model path and the output directory names.
const name = "GNN.Resid";
# Alpha list handed to get_residuals for the splits that feed the network inputs.
const residual_alphas = ["UserItemBiases"]
# Alpha list handed to get_residuals for the splits that provide the training and
# validation targets (`training2` / `validation2`).
const residual_alphas2 = vcat(
    # Main signals
    ["UserItemBiases"],
    ["GNN.$K" for K in 1:1],
    ["MatrixFactorization.$K" for K in (10, 20, 40)],
    ["ItemCF.$(2^p)" for p in 4:2:12],
    ["ItemCF.Resid.$alpha.1.256" for alpha in ("GNN", "ItemCF", "MF")],
)

13-element Vector{String}:
 "UserItemBiases"
 "GNN.1"
 "MatrixFactorization.10"
 "MatrixFactorization.20"
 "MatrixFactorization.40"
 "ItemCF.16"
 "ItemCF.64"
 "ItemCF.256"
 "ItemCF.1024"
 "ItemCF.4096"
 "ItemCF.Resid.GNN.1.256"
 "ItemCF.Resid.ItemCF.1.256"
 "ItemCF.Resid.MF.1.256"

In [3]:
using Random
import BSON

In [4]:
using NBInclude
@nbinclude("Alpha.ipynb");

In [5]:
# Function applied to models and batches to move them to the compute device.
# NOTE(review): `gpu` is not defined in this notebook; presumably it comes from Flux via
# Alpha.ipynb — confirm.
device = gpu;

In [6]:
# Seed the global RNG with a value derived from this notebook's `name`.
Random.seed!(20220313 * hash(name));

## Data preparation

In [7]:
# Splits obtained from get_residuals with the `residual_alphas` list; `training` is the
# source of the input rating matrix R built further down.
const training = get_residuals("training", residual_alphas)
const validation = get_residuals("validation", residual_alphas)
# Split used to build the implicit rating matrix Ri further down.
const implicit_training = get_split("implicit_training")
# Both sizes are one larger than the largest known id; get_data clamps larger ids to
# that extra slot.
const n_items = num_items() + 1 # leave room to map unseen items
const n_users = maximum(training.user) + 1; # leave room to map unseen users

In [8]:
# The same two splits, requested from get_residuals with the longer `residual_alphas2`
# list. Their ratings are what the network is trained and validated against.
const training2 = get_residuals("training", residual_alphas2)
const validation2 = get_residuals("validation", residual_alphas2)
# Map an input split to its target counterpart: `training` -> `training2`,
# `validation` -> `validation2`. Any other argument trips the assertion.
function get_other(split)
    split == training && return training2
    split == validation && return validation2
    @assert false
end
# (item, user) sparse matrix of the `training2` ratings; get_data reads its columns as
# the training targets.
const R2 = sparse(training2.item, training2.user, training2.rating, n_items, n_users)

[32mProgress: 100%|█████████████████████████████████████████| Time: 0:02:54[39m
[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:06[39m


18953×1320150 SparseMatrixCSC{Float32, Int32} with 152803783 stored entries:
⠛⠛⠛⠛⠛⠛⠛⠛⠛⠛⠛⠛⠛⠛⠛⠛⠛⠛⠛⠛⠛⠛⠛⠛⠛⠛⠛⠛⠛⠛⠛⠛⠛⠛⠛⠛⠛⠛⠛⠛

In [9]:
# column accesses are faster than row accesses, so we make this an (item, user) matrix
# R holds the `training` ratings and Ri the `implicit_training` ratings; both are sized
# n_items x n_users, so the last row and column are the slots reserved for unseen ids.
const R = sparse(training.item, training.user, training.rating, n_items, n_users)
const Ri = sparse(
    implicit_training.item,
    implicit_training.user,
    implicit_training.rating,
    n_items,
    n_users,
);

In [10]:
# Sum `split.rating` per user.
#
# Every thread accumulates into its own column of an n_users x nthreads scratch matrix;
# the columns are added together at the end. The result is an n_users x 1 Float32 matrix.
function get_rating_sum(split)
    n_threads = Threads.nthreads()
    partial = zeros(Float32, n_users, n_threads)
    @tprogress Threads.@threads for k = 1:length(split.rating)
        partial[split.user[k], Threads.threadid()] += split.rating[k]
    end
    return sum(partial, dims = 2)
end;

In [11]:
# Per-user sum of the `implicit_training` ratings.
const implicit_counts = get_rating_sum(implicit_training)
# Per-user number of `training` rows: every rating is replaced by one before summing.
const rating_counts = get_rating_sum(
    RatingsDataset(
        training.user,
        training.item,
        fill(one(eltype(training.rating)), length(training.rating)),
    ),
)
# Per-user sum and overall maximum of the ratings in get_split("training").
# NOTE(review): these come from get_split, not get_residuals, while R is built from the
# get_residuals split — confirm that mixing the two in get_data's average is intended.
const rating_sum = get_rating_sum(get_split("training"))
const rating_max = maximum(get_split("training").rating);

[32mProgress: 100%|███████████████████████████| Time: 0:00:06 ( 0.48 μs/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:04 ( 0.46 μs/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:04 ( 0.49 μs/it)[39m


In [12]:
# Build one (input, target) pair for row `j` of `split`.
#
# The input is the user's column of R, the user's column of Ri (both with the row's own
# item zeroed out) and 9 heterogeneous features. The target has one entry per item:
#   * train == true : the user's R2 column, kept only where the masked implicit column
#                     is non-zero
#   * train == false: a single non-zero candidate, the held-out row's rating taken from
#                     get_other(split)
function get_data(split, j, train)
    # ids that are not in the training set are mapped to the reserved last slot
    user = min(split.user[j], n_users)
    item = min(split.item[j], n_items)

    # rating and implicit-rating columns with the current item hidden
    explicit = collect(R[:, user])
    explicit[item] = 0
    implicit = collect(Ri[:, user])
    implicit[item] = 0

    # heterogeneous features, each computed as if the current row were absent
    seen_fraction = max(implicit_counts[user] - 1, 0) / n_items
    rated_fraction = max(rating_counts[user] - 1, 0) / n_items
    mean_rating =
        (rating_sum[user] - R[item, user]) / max(rating_counts[user] - 1, 1) / rating_max
    stats = convert.(Float32, [seen_fraction, rated_fraction, mean_rating])
    expanded = vcat(stats, stats .^ 2, sqrt.(stats))

    inputs = vcat(explicit, implicit, expanded)

    targets = zeros(eltype(explicit), length(explicit))
    if !train
        targets[item] = get_other(split).rating[j]
    else
        observed = implicit .!= 0
        targets[observed] .= collect(R2[:, user])[observed]
    end

    (inputs, targets)
end

# Draw `block_size` random rows of `split` (with replacement) and assemble them into one
# batch.
#
# Arguments
#   split      - dataset with `user`, `item` and `rating` vectors
#   block_size - number of rows to sample
#   train      - forwarded to get_data; selects training or inference targets
#
# Returns a one-element vector `[(X, Y)]` moved to `device`. X and Y have one column per
# sampled row, and column k always corresponds to the k-th sampled index.
#
# Each sample is written to its own slot instead of a per-thread bucket indexed by
# Threads.threadid(). The bucket approach makes the column order depend on thread
# scheduling, so batches are not reproducible from the RNG seed, and it indexes out of
# bounds if threadid() exceeds nthreads().
function get_batch(split, block_size, train)
    idxs = rand(1:length(split.rating), block_size)
    samples = Vector{Any}(undef, length(idxs))
    Threads.@threads for k in eachindex(idxs)
        samples[k] = get_data(split, idxs[k], train)
    end
    X = Flux.batch([sample[1] for sample in samples])
    Y = Flux.batch([sample[2] for sample in samples])
    [(X, Y)] |> device
end;

In [13]:
# Build the network and move it to `device`.
#
# Input : 2 * n_items + 9 values per user: the rating column, the implicit-rating column
#         and the 9 heterogeneous features produced by get_data.
# Output: n_items values per user, one predicted rating per item.
# Dropout(0.5) is applied to the input before the encoder. Layers are constructed in the
# order encoder first, decoder second.
function generate_model()
    input_dim = 2 * n_items + 9
    encoder = Chain(
        Dense(input_dim, 512, relu),
        Dense(512, 256, relu),
        Dense(256, 128, relu),
    )
    rating_decoder = Chain(
        Dense(128, 256, relu),
        Dense(256, 512, relu),
        Dense(512, n_items),
    )
    return device(Chain(Dropout(0.5), encoder, rating_decoder))
end;

## Training

In [14]:
# Mean squared error between `ŷ` and `y`, restricted to the positions where the target
# `y` is non-zero (zero targets are treated as "not observed").
function rating_loss(ŷ, y)
    observed = y .!= 0
    predicted, target = ŷ[observed], y[observed]
    return Flux.mse(predicted, target)
end


# Run model `m` on input batch `x` and score its output against `y` with rating_loss.
loss_components(m, x, y) = rating_loss(m(x), y);

In [15]:
# Reset the global early-stopping state read and updated by train_model.
function reset_training()
    global best_loss, patience, smoothing, min_improvement
    global iters_without_improvement, continue_training, iters

    # hyper-parameters of the early-stopping rule
    patience = 30
    smoothing = 0.9
    min_improvement = 1e-4

    # running state
    best_loss = Inf
    iters_without_improvement = 0
    continue_training = true
    iters = 0
end;

In [16]:
# Train one model with early stopping and return a Dict describing the run.
#
# Arguments
#   model_name - interpolated into the saved model's file name and the returned "name"
#   seed       - passed to Random.seed! before the model is generated
#
# Training state lives in the globals initialised by reset_training (best_loss, patience,
# smoothing, iters_without_improvement, min_improvement, continue_training, iters).
# Whenever the validation loss improves, the model is saved to
# ../../data/alphas/$name/model.$(model_name).bson.
function train_model(model_name, seed)
    Random.seed!(seed)
    m = generate_model()
    ps = Flux.params(m)
    reset_training()
    BLAS.set_num_threads(Threads.nthreads())

    # Setup early stopping callbacks
    # Three-argument method: average loss_components over `epochs` random batches of 128
    # rows drawn from `split`. get_batch returns [(X, Y)], so `[1]...` splats X and Y.
    function evalcb(split, train, epochs)
        losses = []
        @showprogress for epoch = 1:epochs
            push!(losses, loss_components(m, get_batch(split, 128, train)[1]...))
        end
        reduce(.+, losses) ./ length(losses)
    end

    # Zero-argument method: the callback handed to Flux.train! (through the throttle below).
    function evalcb()
        # print losses and perform early stopping
        testmode!(m)
        @debug "iteration: $iters"
        training_losses = evalcb(training, true, 100)
        training_loss = sum(training_losses ./ training_baseline_loss)
        @debug "training losses: $(training_losses) -> $(training_loss)"
        inference_losses = evalcb(validation, false, 500)
        inference_loss = sum(inference_losses ./ inference_baseline_loss)
        # after the first evaluation, blend the new loss with the best loss seen so far
        if best_loss != Inf
            inference_loss = smoothing * best_loss + (1 - smoothing) * inference_loss
        end
        @debug "validation losses: $(inference_losses) -> $(inference_loss)"
        # an evaluation counts as an improvement only if it beats best_loss by more than
        # min_improvement; the model is checkpointed on every improvement
        if inference_loss + min_improvement < best_loss
            global best_loss = inference_loss
            global iters_without_improvement = 0
            BSON.@save "../../data/alphas/$name/model.$(model_name).bson" m
        else
            global iters_without_improvement += 1
            # stop once `patience` consecutive evaluations failed to improve
            if iters_without_improvement >= patience
                global continue_training = false
            end
        end
        trainmode!(m)
    end

    # Setup loss
    # losses are divided by the mean squared target rating of the corresponding split
    training_baseline_loss = mean(training2.rating .^ 2)
    inference_baseline_loss = mean(validation2.rating .^ 2)
    # wrap the callback with a 600 timeout (presumably seconds, i.e. at most one
    # evaluation per 10 minutes — confirm against the Flux.throttle documentation)
    throttled_cb = Flux.throttle(evalcb, 600)
    opt = ADAM(0.001, (0.9, 0.999), 1e-5)

    function loss(x, y)
        sum(loss_components(m, x, y) ./ training_baseline_loss)
    end

    # Train model
    # one random batch of 128 rows per Flux.train! call; evalcb flips continue_training
    while continue_training
        batch = get_batch(training, 128, true)
        Flux.train!(loss, ps, batch, opt, cb = throttled_cb)
        global iters += 1
    end

    Dict(
        "name" => "$name.$model_name",
        "loss" => best_loss,
        "patience" => patience,
        "iters" => iters,
        "model" => "../../data/alphas/$name/model.$(model_name).bson",
        "residual_alphas" => residual_alphas,
        "seed" => seed,
    )
end;

## Write predictions

In [17]:
# Build the model input for user `u` at prediction time; nothing is held out.
#
# Returns vcat(rating column of R, implicit column of Ri, 9 heterogeneous features),
# which is the same layout the training-time get_data produces.
#
# User ids beyond the training set are clamped to the reserved column `n_users`, exactly
# as the training-time get_data does. Without the clamp such an id indexes past the end
# of R, Ri and the per-user count vectors.
function get_data(u)
    u = min(u, n_users)
    # ratings
    X1 = collect(R[:, u])
    # implicit ratings
    X2 = collect(Ri[:, u])
    # heterogeneous features
    features = [
        implicit_counts[u] / n_items, # fraction of items seen
        rating_counts[u] / n_items,  # fraction of items rated
        rating_sum[u] / max(rating_counts[u], 1) / rating_max, # average rating
    ]
    features = convert.(Float32, features)
    X3 = vcat(features, features .^ 2, sqrt.(features))
    vcat(X1, X2, X3)
end

# Assemble the prediction-time inputs for `users` into one batch on `device`.
#
# Column k of the result is the input of users[k]. gmodel relies on this ordering when
# it reads predictions back by column index.
#
# Each input is written to its own slot instead of a per-thread bucket indexed by
# Threads.threadid(). The buckets are concatenated thread by thread, which only
# reproduces the order of `users` if chunks are handed to threads in increasing thread
# id. Dynamic @threads scheduling does not guarantee that, and predictions would then be
# attributed to the wrong users.
function get_batch(users)
    columns = Vector{Any}(undef, length(users))
    Threads.@threads for k in eachindex(users)
        columns[k] = get_data(users[k])
    end
    X = Flux.batch([column for column in columns])
    X |> device
end;

In [18]:
# Predict a rating for every (users[j], items[j]) pair with model `m`.
#
# Arguments
#   m     - model mapping a batch of get_batch(users) inputs to an n_items x batch matrix
#   users - user id per requested prediction
#   items - item id per requested prediction (same length as `users`)
#
# Returns `(ratings, implicit)`, two Float32 vectors aligned with `users`.
#
# NOTE(review): the original returned a variable `implicit` that is never defined in this
# notebook, so the call failed with UndefVarError. The network built by generate_model has
# only a rating decoder, so there is nothing to fill it with. A zero vector is returned
# as a placeholder to keep the two-value return that save_model destructures. Confirm
# whether the ".Implicit" output should be written at all.
function gmodel(m, users, items)
    # index users: for every user, the positions in `users` that ask about them.
    # Built per thread and merged afterwards; keys and values are concretely typed.
    per_thread_idxs = [Dict{eltype(users),Vector{Int}}() for t = 1:Threads.nthreads()]
    @tprogress Threads.@threads for j = 1:length(users)
        idxs = get!(() -> Int[], per_thread_idxs[Threads.threadid()], users[j])
        push!(idxs, j)
    end
    user_to_output_idxs = merge(vcat, per_thread_idxs...)

    # allocate outputs
    ratings = zeros(Float32, length(users))
    implicit = zeros(Float32, length(users))

    # split users into mini-batches; the keys of the index are already the distinct users
    deduped_users = collect(keys(user_to_output_idxs))
    batch(arr, n) = [arr[i:min(i + n - 1, end)] for i = 1:n:length(arr)]
    batches = batch(deduped_users, 128)

    # compute predictions; every output position belongs to exactly one user, so the
    # threads write to disjoint entries of `ratings`
    @tprogress Threads.@threads for i = 1:length(batches)
        b = batches[i]
        alpha = m(get_batch(b)) |> cpu

        # column `input_idx` of alpha holds the predictions for user b[input_idx]
        for (input_idx, u) in enumerate(b)
            for output_idx in user_to_output_idxs[u]
                ratings[output_idx] = alpha[items[output_idx], input_idx]
            end
        end
    end
    ratings, implicit
end;

In [19]:
# Look up `sparse_preds[users[k], items[k]]` for every k and return the values as a
# Float64 vector aligned with `users`.
function make_prediction(sparse_preds, users, items)
    n = length(users)
    out = zeros(n)
    @tprogress Threads.@threads for k = 1:n
        out[k] = sparse_preds[users[k], items[k]]
    end
    return out
end;

In [20]:
# Load the model stored at params["model"], predict every (user, item) pair of the
# training, validation and test splits, and write the parameters and predictions.
#
# Rating predictions go to the directory params["name"]; the second value returned by
# gmodel is written to params["name"] * ".Implicit".
function save_model(params)
    BSON.@load params["model"] m
    testmode!(m)
    BLAS.set_num_threads(1) # gmodel already multithreads

    test = get_residuals("test", residual_alphas)
    full_df = reduce(cat, [training, validation, test])
    all_users, all_items = full_df.user, full_df.item
    ratings, implicit_ratings = gmodel(m, all_users, all_items)
    sparse_preds = sparse(all_users, all_items, ratings)
    implicit_preds = sparse(all_users, all_items, implicit_ratings)

    # lookups handed to write_predictions
    predict_ratings(users, items) = make_prediction(sparse_preds, users, items)
    predict_implicit(users, items) = make_prediction(implicit_preds, users, items)

    outdir = params["name"]
    write_params(params, outdir = outdir)
    write_predictions(predict_ratings, residual_alphas = residual_alphas, outdir = outdir)
    write_predictions(
        predict_implicit,
        residual_alphas = [],
        outdir = outdir * ".Implicit",
        implicit = true,
    )
end;

In [None]:
# Train and save one model per seed; the model is named after the seed's position.
seeds = hash.(rand(Int, 1))
for (i, seed) in enumerate(seeds)
    save_model(train_model(i, seed))
end;

[38;5;6m[1m┌ [22m[39m[38;5;6m[1mInfo: [22m[39m20220320 02:43:38 The GPU function is being called but the GPU is not accessible. 
[38;5;6m[1m└ [22m[39mDefaulting back to the CPU. (No action is required if you want to run on the CPU).
[38;5;4m[1m[ [22m[39m[38;5;4m[1mDebug: [22m[39m20220320 02:44:27 iteration: 0
[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:07[39m
[38;5;4m[1m[ [22m[39m[38;5;4m[1mDebug: [22m[39m20220320 02:44:36 training losses: 0.803414 -> 0.84810424
[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:27[39m
[38;5;4m[1m[ [22m[39m[38;5;4m[1mDebug: [22m[39m20220320 02:45:03 validation losses: 1.2708716 -> 1.0039388
[38;5;4m[1m[ [22m[39m[38;5;4m[1mDebug: [22m[39m20220320 02:55:15 iteration: 607
[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:07[39m
[38;5;4m[1m[ [22m[39m[38;5;4m[1mDebug: [22m[39m20220320 02:55:22 training losses: 0.7805604 -> 0.82