# Write the splits in a Julia-optimized format

In [1]:
using CSV
using DataFrames
using JLD2
using JupyterFormatter
using ProgressMeter

In [2]:
# Turn on JupyterFormatter's automatic cell formatting for this notebook session.
# The trailing `;` suppresses the cell's output.
enable_autoformat();

In [3]:
# Column-oriented container for a set of ratings: one vector per column.
# Julia's default constructor converts each argument to the declared field
# type, so wider inputs (e.g. Int64 / Float64 vectors) are narrowed on construction.
# NOTE(review): the three vectors are presumably parallel (entry i of each
# describes the same rating) — nothing in the struct enforces equal lengths.
struct RatingsDataset
    user::Vector{Int32}     # user ids
    item::Vector{Int32}     # item ids
    rating::Vector{Float32} # rating values
end;

In [4]:
"""
    get_dataset(file)

Read the CSV at `file` and return its `user`, `item` and `rating` columns as a
`RatingsDataset`.

The `user` and `item` values are shifted up by one (Julia is 1-indexed;
presumably the ids on disk are 0-based) and every column is narrowed to the
element type of the matching `RatingsDataset` field.
"""
function get_dataset(file)
    df = DataFrame(CSV.File(file))
    # One fused broadcast per column: shift and narrow in a single pass, writing
    # straight into the final Int32 / Float32 vector. This avoids mutating `df`
    # and avoids materialising an intermediate Float64 copy of the ratings.
    users = Int32.(df.user .+ 1)
    items = Int32.(df.item .+ 1)
    ratings = Float32.(df.rating)
    return RatingsDataset(users, items, ratings)
end

# Load the named split from the shared splits directory as a RatingsDataset.
function get_split(split)
    path = "../../data/splits/$(split).csv"
    return get_dataset(path)
end;

## Load splits

In [5]:
# Load each split CSV via `get_split` and bind the result to a global constant
# of the same name. `@time` prints the elapsed time and allocation statistics
# of every load; the trailing `;` suppresses the cell's value output.
@time const explicit_training = get_split("explicit_training");
@time const explicit_validation = get_split("explicit_validation");
@time const explicit_test = get_split("explicit_test");
@time const implicit_training = get_split("implicit_training");
@time const implicit_validation = get_split("implicit_validation");
@time const implicit_test = get_split("implicit_test");
@time const negative_validation = get_split("negative_validation");
@time const negative_test = get_split("negative_test");

 11.427723 seconds (23.98 M allocations: 17.379 GiB, 3.20% gc time, 46.30% compilation time)
  1.273499 seconds (10.34 k allocations: 2.004 GiB, 20.65% gc time, 0.20% compilation time)
  0.629603 seconds (1.52 k allocations: 2.184 GiB)
  2.566363 seconds (1.53 k allocations: 7.480 GiB, 4.09% gc time)
  0.276939 seconds (1.53 k allocations: 872.323 MiB)
  0.255249 seconds (1.49 k allocations: 741.871 MiB)
  0.987508 seconds (1.48 k allocations: 2.491 GiB)
  1.781183 seconds (1.48 k allocations: 2.491 GiB, 18.60% gc time)


In [6]:
# Destination of the combined JLD2 file holding all splits.
const file = "../../data/splits/splits.jld2";
# Write every split into `file` in one call. The bare keyword arguments after
# `;` are shorthand for `name = name`, so each dataset is stored under its
# variable name. `@time` reports how long the save takes.
@time jldsave(
    file;
    explicit_training,
    explicit_validation,
    explicit_test,
    implicit_training,
    implicit_validation,
    implicit_test,
    negative_validation,
    negative_test,
);

  2.199464 seconds (1.28 M allocations: 61.063 MiB, 34.02% compilation time)
