# Write the splits in a Julia-optimized format

In [1]:
using CSV
using DataFrames
using JLD2
using JupyterFormatter

In [2]:
# Turn on automatic formatting for this session.
# NOTE(review): presumably provided by JupyterFormatter (imported above) and applied to notebook cells — confirm.
enable_autoformat();

In [3]:
# Immutable container holding a set of ratings as three column vectors.
# NOTE(review): the three vectors are presumably parallel (entry i of each describes
# one rating); nothing in this definition enforces equal lengths — confirm with callers.
struct RatingsDataset
    # user identifiers, stored as 32-bit integers
    user::Vector{Int32}
    # item identifiers, stored as 32-bit integers
    item::Vector{Int32}
    # rating values, stored as 32-bit floats
    rating::Vector{Float32}
end;

In [4]:
# Read the CSV at `file` and pack its `username`, `anime_id` and `my_score` columns
# into a `RatingsDataset`.
#
# Both id columns are incremented by one in place before being stored, and the score
# column is passed through `float`. Conversion to the struct's field types is left to
# the `RatingsDataset` constructor.
function get_dataset(file)
    table = DataFrame(CSV.File(file))
    users, items = table.username, table.anime_id
    # julia is 1 indexed, so shift both id columns up by one (in place)
    users .+= 1
    items .+= 1
    scores = float(table.my_score)
    return RatingsDataset(users, items, scores)
end

# Load the split called `split` from its CSV file under the splits directory.
function get_split(split)
    return get_dataset("../../data/splits/$(split).csv")
end;

In [5]:
# Load the three named splits and the implicit lists, timing each load with `@time`.
@time training = get_split("training");
@time validation = get_split("validation");
@time test = get_split("test");
# No trailing `;` on this line, so the notebook displays the resulting dataset.
@time implicit = get_dataset("../../data/processed_data/user_implicit_lists.csv")

 35.472678 seconds (8.47 M allocations: 13.357 GiB, 2.25% gc time, 0.32% compilation time)
  1.077775 seconds (4.87 k allocations: 685.108 MiB, 73.49% gc time)
  0.198039 seconds (2.53 k allocations: 684.781 MiB)
 28.555790 seconds (2.41 k allocations: 14.748 GiB, 1.41% gc time)


RatingsDataset(Int32[851625, 851625, 851625, 851625, 851625, 851625, 851625, 851625, 851625, 851625  …  369188, 369188, 369188, 369188, 369188, 369188, 369188, 369188, 369188, 369188], Int32[11528, 498, 805, 41, 12807, 7670, 101, 3963, 116, 6464  …  14194, 3, 6082, 774, 14254, 7169, 15584, 9017, 14423, 369], Float32[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0  …  1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])

In [6]:
# Destination path for the combined JLD2 file.
file = "../../data/splits/splits.jld2";
# Save all four datasets into that one file; the `; name` shorthand passes each
# variable as a keyword argument named after itself. The call is timed with `@time`.
@time jldsave(file; training, validation, test, implicit);

 40.513498 seconds (5.29 M allocations: 305.290 MiB, 0.19% gc time, 4.96% compilation time)
