# Write the splits in a Julia-optimized format

In [1]:
using CSV
using DataFrames
using JLD2
using JupyterFormatter
using ProgressMeter

In [2]:
# Presumably turns on JupyterFormatter's automatic formatting of notebook cells
# (inferred from the name; confirm against the package documentation).
# The trailing `;` suppresses the cell's output.
enable_autoformat();

In [3]:
# Column-oriented container for one split of the ratings data.
# The three vectors are meant to be read in parallel: entry `i` of each one
# comes from the same row of the source table. The default constructor does
# not check that the vectors have equal length.
struct RatingsDataset
    user::Vector{Int32}     # user id of each row
    item::Vector{Int32}     # item id of each row
    rating::Vector{Float32} # rating value of each row
end;

In [4]:
"""
    get_dataset(file)

Read the CSV at `file` and return its `user`, `item` and `rating` columns as a
`RatingsDataset`.

`user` and `item` are shifted by one (the ids in the CSV are presumably 0-based,
while Julia indexes from 1) and stored as `Int32`; `rating` is stored as
`Float32`. An `InexactError` is thrown if a shifted id does not fit in an `Int32`.
"""
function get_dataset(file)
    df = DataFrame(CSV.File(file))
    # Shift and narrow in one fused pass per column, so no intermediate column
    # (e.g. a Float64 copy of `rating`) is materialised before the final
    # conversion to the struct's element types.
    user = Int32.(df.user .+ 1) # julia is 1 indexed
    item = Int32.(df.item .+ 1)
    rating = Float32.(df.rating)
    return RatingsDataset(user, item, rating)
end

# Load the split called `split` from its CSV file in the shared splits directory.
function get_split(split)
    path = "../../data/splits/$(split).csv"
    return get_dataset(path)
end;

## Load splits

In [5]:
# Load every split into a global constant, timing each load separately.
# Explicit-feedback splits.
const explicit_training = @time get_split("explicit_training");
const explicit_validation = @time get_split("explicit_validation");
const explicit_test = @time get_split("explicit_test");
# Implicit-feedback splits.
const implicit_training = @time get_split("implicit_training");
const implicit_validation = @time get_split("implicit_validation");
const implicit_test = @time get_split("implicit_test");
# The split stored in negative.csv.
const negative = @time get_split("negative");

 10.817875 seconds (31.21 M allocations: 20.731 GiB, 5.20% gc time, 67.89% compilation time)
  1.252730 seconds (12.07 k allocations: 1.911 GiB, 50.99% gc time, 1.14% compilation time)
  0.742119 seconds (3.35 k allocations: 2.372 GiB, 38.02% gc time)
  1.903209 seconds (3.42 k allocations: 7.105 GiB, 15.92% gc time)
  0.516470 seconds (3.25 k allocations: 824.421 MiB, 59.00% gc time)
  0.246884 seconds (3.15 k allocations: 741.965 MiB, 30.15% gc time)
  0.955169 seconds (3.11 k allocations: 2.491 GiB, 27.71% gc time)


In [6]:
# Destination of the combined JLD2 archive.
const file = "../../data/splits/splits.jld2";
# Store every split in one file, each under a key equal to its variable name.
@time jldsave(
    file;
    explicit_training = explicit_training,
    explicit_validation = explicit_validation,
    explicit_test = explicit_test,
    implicit_training = implicit_training,
    implicit_validation = implicit_validation,
    implicit_test = implicit_test,
    negative = negative,
);

  4.396662 seconds (1.27 M allocations: 60.982 MiB, 1.46% gc time, 21.24% compilation time)
