# Write the splits in a Julia-optimized format

In [1]:
using CSV
using DataFrames
using JLD2
using JupyterFormatter
using ProgressMeter

In [2]:
# Switch on automatic code formatting for this notebook session.
# NOTE(review): `enable_autoformat` is presumably exported by JupyterFormatter
# (loaded above) and reformats cells as they run — confirm against the package docs.
enable_autoformat();

In [3]:
# Column-wise container for one split of ratings data.
# `get_dataset` builds it from the `user`, `item` and `rating` columns of a CSV
# file, and the instances are what later get written out with `jldsave`.
# NOTE(review): the three vectors are presumably index-aligned (entry i of each
# describes the same rating); nothing in this definition enforces equal lengths.
struct RatingsDataset
    # User ids, stored as 32-bit integers.
    user::Vector{Int32}
    # Item ids, stored as 32-bit integers.
    item::Vector{Int32}
    # Rating values, stored as 32-bit floats.
    rating::Vector{Float32}
end;

In [4]:
"""
    get_dataset(file)

Read the CSV at `file` and return its `user`, `item` and `rating` columns as a
`RatingsDataset`.

Both id columns are shifted up by one before being stored (the ids in the file
are presumably 0-based, whereas Julia indexes from 1). Ratings are passed through
`float`; the `RatingsDataset` constructor then converts every column to the
element type declared by its fields.
"""
function get_dataset(file)
    table = DataFrame(CSV.File(file))
    # Shift ids so they can be used directly as 1-based Julia indices.
    users = table.user .+ 1
    items = table.item .+ 1
    ratings = float(table.rating)
    return RatingsDataset(users, items, ratings)
end

"""
    get_split(split)

Load the split called `split` from the CSV file of the same name inside
`../../data/splits/` and return whatever `get_dataset` produces for it.
"""
function get_split(split)
    path = "../../data/splits/$(split).csv"
    return get_dataset(path)
end;

## Load splits

In [5]:
# Read every split into memory, one global per split; `@time` prints how long
# each individual load took (the first one also includes compilation).
explicit_training = @time get_split("explicit_training");
explicit_validation = @time get_split("explicit_validation");
explicit_test = @time get_split("explicit_test");
implicit_training = @time get_split("implicit_training");
implicit_validation = @time get_split("implicit_validation");
implicit_test = @time get_split("implicit_test");
negative = @time get_split("negative");

 39.840059 seconds (7.51 M allocations: 10.858 GiB, 1.23% gc time, 39.78% compilation time)
  2.071833 seconds (11.42 k allocations: 1.188 GiB, 42.58% gc time, 0.23% compilation time)
  1.711438 seconds (837 allocations: 1.298 GiB, 22.53% gc time)
  4.544684 seconds (834 allocations: 4.019 GiB, 15.77% gc time)
  0.694552 seconds (833 allocations: 468.120 MiB, 32.61% gc time)
  0.683812 seconds (835 allocations: 509.450 MiB, 34.20% gc time)
  1.932571 seconds (834 allocations: 1.886 GiB, 6.93% gc time)


In [6]:
# Bundle all seven splits into a single JLD2 archive in the same directory as
# the CSV sources. Each keyword name is the key its dataset is stored under.
file = "../../data/splits/splits.jld2";
@time jldsave(
    file;
    explicit_training = explicit_training,
    explicit_validation = explicit_validation,
    explicit_test = explicit_test,
    implicit_training = implicit_training,
    implicit_validation = implicit_validation,
    implicit_test = implicit_test,
    negative = negative,
);

  5.174518 seconds (6.14 M allocations: 324.249 MiB, 1.91% gc time, 60.71% compilation time)
