# Generalized Neural Network
* A denoising autoencoder that reconstructs each user's ratings and implicit ratings (an indicator of which items the user has rated) from a dropout-corrupted copy of them

In [1]:
# Name of this alpha; it is also interpolated into the path the model is saved under
name = "GNN";
# Alphas passed to get_residuals when the training and validation splits are loaded
residual_alphas = ["UserItemBiases"];

In [2]:
using Flux
import BSON

In [3]:
using NBInclude
@nbinclude("Alpha.ipynb");

In [4]:
# Give BLAS as many threads as this Julia session was started with
BLAS.set_num_threads(Threads.nthreads());

In [5]:
# Both the batches and the model are piped through `device`.
# NOTE(review): `gpu` is presumably Flux's GPU transfer function; swapping in `cpu`
# should allow running without a GPU — confirm
device = gpu;

## Data preparation

In [6]:
# Load both splits through get_residuals, using the alphas listed in `residual_alphas`
const training = get_residuals("training", residual_alphas);
const validation = get_residuals("validation", residual_alphas)
# Stored as an (item, user) matrix so that one user's ratings form a single column:
# column accesses are faster than row accesses. Each dimension is one larger than the
# biggest index in the training split, leaving room for unseen users and items.
const R = sparse(
    training.item,
    training.user,
    Float32.(training.rating),
    1 + maximum(training.item),
    1 + maximum(training.user),
);
const n_items, n_users = size(R);

In [7]:
# Count how many entries of the training split belong to each user.
# Every iteration increments the column selected by Threads.threadid(), so concurrently
# running threads write to different cells; the per-thread columns are summed afterwards.
# NOTE(review): this relies on threadid() identifying a distinct column for every
# concurrently running iteration — confirm against the scheduler of the Julia version in use.
# NOTE(review): @tprogress is defined outside this cell; presumably a thread-aware
# progress bar — confirm.
counts = zeros(n_users, Threads.nthreads())
@tprogress Threads.@threads for u in training.user
    counts[u, Threads.threadid()] += 1
end
# Collapse the per-thread columns into an n_users × 1 matrix of totals
counts = sum(counts, dims = 2);

[32mProgress: 100%|███████████████████████████| Time: 0:00:01 ( 0.33 μs/it)[39m


In [8]:
# Build one (input, rating target, implicit target) example from entry `j` of `split`.
#
# Input X  : the user's ratings (unseen shows are zero), then the implicit ratings,
#            then 3 heterogeneous features — length 2 * n_items + 3.
# Targets  : when `train` is true, the user's ratings and implicit ratings;
#            otherwise only the held-out item `i` is filled in.
# In both modes the entry for item `i` is zeroed out of the input.
# Reads the globals `n_users`, `n_items` and `counts`.
function get_data(R, split, j, train)
    # users and items that aren't in the training set are clamped onto the last index
    u = min(split.user[j], n_users)
    i = min(split.item[j], n_items)

    # explicit ratings, with the item of this example removed
    ratings = collect(R[:, u])
    ratings[i] = 0

    # implicit ratings: 1 wherever an explicit rating is present
    implicit = copy(ratings)
    implicit[implicit .!= 0] .= 1

    # heterogeneous features derived from the user's rating count, minus the one
    # removed above, as a fraction of the number of items
    frac = convert(Float32, max(counts[u] - 1, 0) / n_items) # TODO benchmark use of global
    features = [frac, sqrt(frac), frac^2]
    X = vcat(ratings, implicit, features)

    # targets start out as all zeros
    Y1 = zeros(eltype(ratings), length(ratings))
    Y2 = zeros(eltype(implicit), length(implicit))
    if !train
        # inference: the only target is the held-out item
        Y1[i] = split.rating[j]
        Y2[i] = 1
    else
        # training: reconstruct every item that is present in the input
        seen = implicit .!= 0
        Y1[seen] .= ratings[seen]
        Y2[seen] .= implicit[seen]
    end

    return (X, Y1, Y2)
end

# Sample `block_size` entries of `split` uniformly at random (with replacement), build
# one example per entry with get_data, and stack the examples column-wise.
#
# Returns a one-element vector holding `(X, (Y1, Y2))`, piped through `device`, which
# is the shape Flux.train! iterates over in the training loop.
function get_batch(R, split, block_size, train)
    sampled = rand(1:length(split.rating), block_size)
    # One pre-allocated slot per sample: each iteration writes only its own index, so
    # the result does not depend on Threads.threadid() staying fixed for a task, and
    # the batch columns follow the sampling order. Eltype is Any because the concrete
    # tuple type returned by get_data is not known here; the comprehensions below
    # narrow it again.
    samples = Vector{Any}(undef, block_size)
    Threads.@threads for k in eachindex(sampled)
        samples[k] = get_data(R, split, sampled[k], train)
    end
    X = Flux.batch([s[1] for s in samples])
    Y1 = Flux.batch([s[2] for s in samples])
    Y2 = Flux.batch([s[3] for s in samples])
    return [(X, (Y1, Y2))] |> device
end;

In [9]:
# custom split layer: holds several sub-models ("paths") and feeds one input to all of them
struct Split{T}
    paths::T
end

# Convenience constructor: Split(a, b, ...) gathers its arguments into a tuple of paths
function Split(paths...)
    return Split(paths)
end

# Register the layer with Flux through its @functor macro
Flux.@functor Split

# Calling the layer applies every path to the same input and returns the results,
# one per path, via `map` over `paths`
function (m::Split)(x::AbstractArray)
    return map(path -> path(x), m.paths)
end

In [10]:
# inputs are the user's ratings for all shows (unseen shows get mapped to zero) + implicit
# ratings + 3 heterogeneous features, i.e. n_items + n_items + 3 values per user
# outputs are the user's ratings for all shows (unseen shows get mapped to zero) and the
# implicit ratings, each of length n_items
# we will train ratings using mse on observed shows, and implicit ratings via crossentropy loss
#
# `n_items` is the constant taken from size(R) in the data-preparation cell. It is reused
# here rather than assigned again: re-assigning a `const` global is at best a no-op.
encoder = Chain(
    Dense(n_items + n_items + 3, 512, relu),
    Dense(512, 256, relu),
    Dense(256, 128, relu),
)
# two decoders with identical shapes share the 128-dimensional encoding
rating_decoder = Chain(Dense(128, 256, relu), Dense(256, 512, relu), Dense(512, n_items))
implicit_decoder = Chain(Dense(128, 256, relu), Dense(256, 512, relu), Dense(512, n_items))
# Dropout on the raw input is what makes this a denoising autoencoder
m = Chain(Dropout(0.5), encoder, Split(rating_decoder, implicit_decoder)) |> device
ps = Flux.params(m);
m

Chain(
  Dropout(0.5),
  Chain(
    Dense(33965, 512, relu),            [90m# 17_390_592 parameters[39m
    Dense(512, 256, relu),              [90m# 131_328 parameters[39m
    Dense(256, 128, relu),              [90m# 32_896 parameters[39m
  ),
  Split(
    Tuple(
      Chain(
        Dense(128, 256, relu),          [90m# 33_024 parameters[39m
        Dense(256, 512, relu),          [90m# 131_584 parameters[39m
        Dense(512, 16981),              [90m# 8_711_253 parameters[39m
      ),
      Chain(
        Dense(128, 256, relu),          [90m# 33_024 parameters[39m
        Dense(256, 512, relu),          [90m# 131_584 parameters[39m
        Dense(512, 16981),              [90m# 8_711_253 parameters[39m
      ),
    ),
  ),
)[90m                   # Total: 18 arrays, [39m35_306_538 parameters, 134.685 MiB.

## Training

In [11]:
# Mean squared error restricted to the positions where the target `y` is non-zero;
# zero targets are excluded from the loss.
function rating_loss(ŷ, y)
    observed = y .!= 0
    return Flux.mse(ŷ[observed], y[observed])
end

# Logit cross-entropy between the implicit-rating outputs `ŷ` and the targets `y`.
function implicit_loss(ŷ, y)
    return Flux.logitcrossentropy(ŷ, y)
end

# Masked variant: the entries of `ŷ` selected by `mask` are overwritten in place with
# -1e3 before the loss is computed. Note that this mutates `ŷ`.
function implicit_loss(ŷ, y, mask)
    ŷ[mask] .= -1e3
    return implicit_loss(ŷ, y)
end

# Run the global model `m` on `x` and return the pair (rating loss, implicit loss).
# `y` is indexed as (rating targets, implicit targets).
# When `train` is false, the non-zero entries in the first n_items rows of the input
# are passed to implicit_loss as a mask.
function loss_components(x, y, train)
    ŷ = m(x)
    rating_term = rating_loss(ŷ[1], y[1])
    implicit_term = if train
        implicit_loss(ŷ[2], y[2])
    else
        seen = (x .!= 0)[1:n_items, :]
        implicit_loss(ŷ[2], y[2], seen)
    end
    return (rating_term, implicit_term)
end;

In [12]:
# Optimiser handed to Flux.train! in the training loop; constructed with no arguments
opt = ADAM();

In [13]:
# Early-stopping state, read and updated by the zero-argument evalcb
best_loss = Inf                 # lowest normalised validation loss seen so far
patience = 10                   # evaluations without improvement allowed before stopping
iters_without_improvement = 0   # reset to 0 whenever best_loss improves
continue_training = true        # training-loop flag; cleared once patience is exhausted
iters = 0                       # incremented once per pass of the training loop


# Average the loss components of the current model over 100 freshly sampled batches of
# 128 examples from `split`. `train` selects both how examples are built and which
# loss variant is used. Returns the element-wise mean of the component tuples.
function evalcb(R, split, train)
    n_batches = 100
    batch_losses = []
    @showprogress for batch_index = 1:n_batches
        x, y = get_batch(R, split, 128, train)[1]
        push!(batch_losses, loss_components(x, y, train))
    end
    total = reduce(.+, batch_losses)
    return total ./ length(batch_losses)
end


# Loss components of the model as it stands when this cell runs, measured in training
# mode and in inference mode. Both are measured on the `training` split. They are used
# as denominators so that the rating and implicit losses are summed on a comparable scale.
const training_baseline_loss = evalcb(R, training, true)
const inference_baseline_loss = evalcb(R, training, false);
# Training objective: each training-mode loss component divided by its baseline, summed.
function loss(x, y)
    components = loss_components(x, y, true)
    return sum(components ./ training_baseline_loss)
end

# Evaluation callback: logs training and validation losses and performs early stopping.
# The model is switched to test mode for the evaluation and back to train mode at the end.
# Whenever the normalised validation loss improves on `best_loss`, the model is saved;
# after `patience` evaluations without improvement, `continue_training` is set to false.
function evalcb()
    # the early-stopping state lives in globals that this callback updates
    global best_loss, iters_without_improvement, continue_training

    testmode!(m)
    @debug "iteration: $iters"

    training_losses = evalcb(R, training, true)
    training_loss = sum(training_losses ./ training_baseline_loss)
    @debug "training losses: $(training_losses) -> $(training_loss)"

    inference_losses = evalcb(R, validation, false)
    # `loss` is a local here; it shadows the global loss function inside this body only
    loss = sum(inference_losses ./ inference_baseline_loss)
    @debug "validation losses: $(inference_losses) -> $(loss)"

    if loss < best_loss
        best_loss = loss
        iters_without_improvement = 0
        BSON.@save "../../data/alphas/$name/model.bson" m
    else
        iters_without_improvement += 1
        iters_without_improvement >= patience && (continue_training = false)
    end

    return trainmode!(m)
end

# Rate-limit the evaluation callback: each call evaluates 200 batches (100 training,
# 100 validation) and may write a checkpoint.
# NOTE(review): the 30 passed to Flux.throttle is presumably a timeout in seconds — confirm
throttled_cb = Flux.throttle(evalcb, 30);

[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:20[39m
[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:08[39m


In [None]:
# Train on one freshly sampled batch of 128 examples per pass until the early-stopping
# callback sets `continue_training` to false.
while continue_training
    batch = get_batch(R, training, 128, true)
    Flux.train!(loss, ps, batch, opt, cb = throttled_cb)
    # `global` makes the update of the top-level counter explicit, so the loop behaves
    # the same in a notebook (soft scope) and when the code is run as a script
    global iters += 1
end

[38;5;4m[1m[ [22m[39m[38;5;4m[1mDebug: [22m[39m20220128 05:21:28 iteration: 0
[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:09[39m
[38;5;4m[1m[ [22m[39m[38;5;4m[1mDebug: [22m[39m20220128 05:21:38 training losses: (1.6617624f0, 4147.5684f0) -> 2.0450315
[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:09[39m
[38;5;4m[1m[ [22m[39m[38;5;4m[1mDebug: [22m[39m20220128 05:21:48 validation losses: (1.685081f0, 9.700835f0) -> 1.9845676
[38;5;4m[1m[ [22m[39m[38;5;4m[1mDebug: [22m[39m20220128 05:22:33 iteration: 28
[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:07[39m
[38;5;4m[1m[ [22m[39m[38;5;4m[1mDebug: [22m[39m20220128 05:22:41 training losses: (1.5716187f0, 3470.235f0) -> 1.8218625
[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:10[39m
[38;5;4m[1m[ [22m[39m[38;5;4m[1mDebug: [22m[39m20220128 05:22:52 validation losses: (1.6143215f0, 7.84

## Write predictions

In [None]:
# function get_data(R, u)
#     X = collect(R[:, u])
#     Xr = copy(X)
#     Xr[Xr.!=0] .= 1
#     X = vcat(X, Xr)

#     # add heterogeneous features
#     weight = sum(Xr .!= 0)
#     nitems_feature = weight / size(R)[1]
#     push!(X, nitems_feature)
#     push!(X, sqrt(nitems_feature))
#     push!(X, nitems_feature^2)
#     X
# end;

In [None]:
# function model(users, items)
#     df = RatingsDataset(users, items, fill(0, length(users)))
#     predictions = zeros(length(users))
#     deduped_users = collect(Set(users))
#     @tprogress Threads.@threads for i = 1:length(deduped_users)
#         u = deduped_users[i]
#         alpha = m(get_data(R, u))
#         for j = 1:length(users)
#             if users[j] == u
#                 predictions[j] = alpha[items[j]]
#             end
#         end
#     end
#     predictions
# end;

In [None]:
# BSON.@load "../../data/alphas/$name/model.bson" m

In [None]:
# write_predictions(model)