# Simple User Item Biases
* Computes a bias for each user and for each item
* Prediction for user $i$ and item $j$ is $\tilde r_{ij} = m + u_i + a_j$
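* $m = \text{mean}_{ij}(r_{ij})$ is the global mean over all observed training ratings
* $u_i = \text{mean}_j(r_{ij}) - m$ is the bias of user $i$ (mean over the items that user rated)
* $a_j = \text{mean}_i(r_{ij}) - m$ is the bias of item $j$ (mean over the users who rated that item)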
* $r_{ij}$ is the rating for user $i$ and item $j$
* Useful as a preprocessing step for other alphas

In [1]:
using CSV
using DataFrames
using DataStructures
using FileIO
using JLD2
using JupyterFormatter
using Statistics
import Metrics

In [2]:
# Turn on automatic formatting of notebook cells
# (presumably provided by JupyterFormatter — confirm).
enable_autoformat();

In [3]:
"""
    get_split(split)

Load the `"training"` or `"validation"` split from `../../data/splits/<split>.csv`
and return it as a `DataFrame`.

The `username` and `anime_id` columns are shifted up by one because Julia is
1-indexed, and `my_score` is converted with `float`.

# Throws
- `ArgumentError` if `split` is neither `"training"` nor `"validation"`.
"""
function get_split(split)
    # Explicit check rather than `@assert`: assertions are a debugging aid and
    # may be disabled, so they must not guard user input.
    if !(split in ("training", "validation"))
        throw(
            ArgumentError(
                "split must be \"training\" or \"validation\", got $(repr(split))",
            ),
        )
    end
    file = "../../data/splits/$(split).csv"
    df = DataFrame(CSV.File(file))
    df.username .+= 1 # julia is 1 indexed
    df.anime_id .+= 1
    df.my_score = float(df.my_score)
    return df
end;

In [4]:
"""
    write_prediction(df, split)

Write `df` as `<split>.csv` into the output directory `../../data/alphas/<name>`,
where `name` is the global alpha name. The directory is created if it is missing.

The function works on `copy(df)` and shifts `username` and `anime_id` down by
one before writing, mirroring the +1 shift applied when the splits are loaded.
Returns whatever `CSV.write` returns.

# Throws
- `ArgumentError` if `split` is not `"validation"`.
"""
function write_prediction(df, split)
    # Explicit check rather than `@assert`: assertions are a debugging aid and
    # may be disabled, so they must not guard user input.
    if !(split in ("validation",))
        throw(ArgumentError("split must be \"validation\", got $(repr(split))"))
    end
    outdir = "../../data/alphas/$name"
    mkpath(outdir) # no-op when the directory already exists
    df = copy(df)
    df.username .-= 1
    df.anime_id .-= 1
    return CSV.write("$(outdir)/$(split).csv", df)
end;

In [5]:
"""
    write_model(params)

Save `params` to `model.jld2` inside the output directory
`../../data/alphas/<name>`, where `name` is the global alpha name. The
directory is created first when it does not exist yet.
"""
function write_model(params)
    outdir = "../../data/alphas/$name"
    isdir(outdir) || mkpath(outdir)
    target = "$(outdir)/model.jld2"
    return save(target, params)
end;

In [6]:
"""
    evaluate(truth, pred)

Print RMSE, MAE and R2 of `pred` against `truth` on a single line, without a
trailing newline. Each `Metrics` function is called as `(pred, truth)`.
"""
function evaluate(truth, pred)
    rmse = sqrt(Metrics.mse(pred, truth))
    print("RMSE ", rmse)
    mae = Metrics.mae(pred, truth)
    print(" MAE ", mae)
    r2 = Metrics.r2_score(pred, truth)
    print(" R2 ", r2)
    return nothing
end;

In [7]:
# Identifier of this alpha; write_prediction and write_model interpolate it
# into their output directory ../../data/alphas/<name>.
name = "SimpleUserItemBiases";

In [8]:
# Training split as returned by get_split (ids shifted to be 1-based, float scores).
training = get_split("training");

In [9]:
# Validation split as returned by get_split (ids shifted to be 1-based, float scores).
validation = get_split("validation");

## Training

In [10]:
# Global mean of all training scores (the `m` term of the model).
μ = mean(training.my_score);

In [11]:
# User biases: each user's mean training score minus the global mean μ.
# Stored in a DefaultDict with default 0.0 for users that are not in the table.
u = let per_user = combine(groupby(training, :username), :my_score => mean => :my_score)
    DefaultDict(0.0, Dict(per_user.username .=> (per_user.my_score .- μ)))
end;

In [12]:
# Item biases: each item's mean training score minus the global mean μ.
# Stored in a DefaultDict with default 0.0 for items that are not in the table.
a = let per_item = combine(groupby(training, :anime_id), :my_score => mean => :my_score)
    DefaultDict(0.0, Dict(per_item.anime_id .=> (per_item.my_score .- μ)))
end;

In [14]:
"""
    make_prediction(users, items, μ, u, a)

Return a vector whose `k`-th entry is `u[user] + a[item] + μ` for the `k`-th
`(user, item)` pair taken position-wise from `users` and `items`.

# Arguments
- `users`, `items`: equally long collections of keys into `u` and `a`.
- `μ`: global offset; `eltype(μ)` is the element type of the result.
- `u`, `a`: lookup tables indexable by the entries of `users` and `items`.

# Throws
- `DimensionMismatch` if `users` and `items` differ in length.
"""
function make_prediction(users, items, μ, u, a)
    # Without this check a longer `items` would be silently truncated.
    if length(users) != length(items)
        throw(
            DimensionMismatch(
                "users has length $(length(users)) but items has length $(length(items))",
            ),
        )
    end
    r = zeros(eltype(μ), length(users))
    # Walk the inputs by position instead of assuming 1-based indices.
    for (k, (user, item)) in enumerate(zip(users, items))
        r[k] = u[user] + a[item] + μ
    end
    return r
end;

# Predict scores for (user, item) pairs using the globals μ, u and a.
function model(users, items)
    return make_prediction(users, items, μ, u, a)
end;

## Inference

In [15]:
# Predict on the training split and print RMSE / MAE / R2 against its true scores.
training_pred_score = model(training.username, training.anime_id);
evaluate(training.my_score, training_pred_score);

RMSE 1.3329115279899955 MAE 0.9997747594835902 R2 0.4395862105349557

In [16]:
# Predict on the validation split and print RMSE / MAE / R2 against its true scores.
val_pred_score = model(validation.username, validation.anime_id);
evaluate(validation.my_score, val_pred_score);

RMSE 1.342890864774748 MAE 1.0067577667022556 R2 0.4312189466062175

In [17]:
# write predictions to disk: a copy of the validation rows whose my_score
# column is replaced by the predicted score
val_pred = copy(validation);
val_pred.my_score = val_pred_score;
write_prediction(val_pred, "validation");

In [18]:
# write model to disk: global mean, user biases, item biases and the `model` function
# NOTE(review): `model` is a function that reads the globals μ, u and a — confirm
# that it round-trips through JLD2 as intended.
write_model(Dict("μ" => μ, "u" => u, "a" => a, "model" => model));