# Write the splits in a Julia-optimized format

In [1]:
using CSV
using DataFrames
using JLD2
using JupyterFormatter

In [2]:
# Turn on automatic formatting of notebook cells.
# NOTE(review): `enable_autoformat` is presumably exported by JupyterFormatter
# (loaded above); confirm against that package's documentation.
enable_autoformat();

In [3]:
# Container for one data split, held as three column vectors.
# NOTE(review): get_split fills these from columns of a single DataFrame, so they
# are presumably parallel (entry i of each vector describes one rating); the
# struct itself does not enforce equal lengths.
struct RatingsDataset
    user::Vector{Int64} # user ids (get_split passes `username .+ 1`)
    item::Vector{Int64} # item ids (get_split passes `anime_id .+ 1`)
    rating::Vector{Float64} # ratings (get_split passes `float(my_score)`)
end;

In [4]:
"""
    get_split(split)

Load the split named `split` from `../../data/splits/<split>.csv` and return it
as a `RatingsDataset`.

The CSV is expected to contain the columns `username`, `anime_id` and
`my_score`. The two id columns are shifted by one (Julia is 1-indexed) and the
scores are converted to floating point.

# Arguments
- `split`: one of `"training"`, `"validation"` or `"test"`.

# Throws
- `ArgumentError` if `split` is not one of the three known split names.
"""
function get_split(split)
    # Explicit check instead of `@assert`: assertions are a debugging aid and may
    # be disabled, so they must not guard user-supplied input.
    if !(split in ("training", "validation", "test"))
        throw(
            ArgumentError(
                "split must be \"training\", \"validation\" or \"test\", got $(repr(split))",
            ),
        )
    end
    file = "../../data/splits/$(split).csv"
    df = DataFrame(CSV.File(file))
    # julia is 1 indexed, so shift the ids stored in the file by one
    users = df.username .+ 1
    items = df.anime_id .+ 1
    ratings = float(df.my_score)
    return RatingsDataset(users, items, ratings)
end;

In [5]:
# Load the three splits into globals; @time reports the elapsed time and
# allocations of each load (see the recorded output below).
@time training = get_split("training");
@time validation = get_split("validation");
@time test = get_split("test");

 40.754407 seconds (7.65 M allocations: 5.333 GiB, 0.68% gc time, 2.32% compilation time)
  0.930786 seconds (4.86 k allocations: 630.620 MiB, 53.78% gc time, 1.15% compilation time)
  0.363461 seconds (2.38 k allocations: 458.945 MiB)


In [6]:
# Write the three loaded splits into a single JLD2 file next to the CSVs.
# The keyword shorthand passes each variable under its own name, so the entries
# are presumably stored as "training", "validation" and "test" — confirm against
# the JLD2 `jldsave` documentation.
file = "../../data/splits/splits.jld2";
@time jldsave(file; training, validation, test);

 26.746952 seconds (5.22 M allocations: 301.512 MiB, 14.85% compilation time)
