In [None]:
# Load the notebook formatter and switch on its autoformat hook.
# NOTE(review): presumably this reformats each cell when it is executed — confirm
# against the JupyterFormatter documentation.
import JupyterFormatter
JupyterFormatter.enable_autoformat();

In [None]:
import CSV
import DataFrames
import Dates
import JSON
import Glob
import ProgressMeter: @showprogress, next!

In [None]:
"""
    read_csv(x; kw...)

Read the CSV source `x` into a `DataFrames.DataFrame` via `CSV.read`.

By default every column is parsed as `String` (`types = String`) and no token is
treated as missing (`missingstring = nothing`). Any keyword in `kw` is forwarded
to `CSV.read` and takes precedence over these defaults.
"""
function read_csv(x; kw...)
    # Defaults come first so that caller-supplied keywords override them.
    options = (; types = String, missingstring = nothing, kw...)
    return CSV.read(x, DataFrames.DataFrame; options...)
end;

In [None]:
"""
    get_data_path(file)

Return the path of `file` inside the `data` directory located three levels above
the directory containing this source file.
"""
function get_data_path(file)
    relative = "../../../data/$file"
    return joinpath(@__DIR__, relative)
end;

In [None]:
# Directory the media tables are read from.
const SRC = get_data_path("media")
# Directory every output of this notebook is written to.
const DST = get_data_path("processed_data")
# Names of the data sources; also used as directory names and file-name prefixes.
const SOURCES = ["mal", "anilist", "kitsu", "animeplanet"]
# Create the output directory unless something already exists at that path.
ispath(DST) || mkpath(DST);

In [None]:
"""
    save_media()

Copy the media tables from `SRC` into `DST`, overwriting existing files.

For each medium (`manga`, `anime`) this copies the matched table, the relations
table and one table per entry of `SOURCES`, in that order.
"""
function save_media()
    for medium in ["manga", "anime"]
        # Collect every (origin => destination) pair first, then copy in order.
        transfers = Pair{String,String}[
            "$SRC/match/$medium.csv" => "$DST/$medium.csv",
            "$SRC/relations/$medium.relations.csv" => "$DST/$medium.relations.csv",
        ]
        append!(
            transfers,
            ["$SRC/sources/$s.$medium.csv" => "$DST/$s.$medium.csv" for s in SOURCES],
        )
        for (origin, destination) in transfers
            cp(origin, destination; force = true)
        end
    end
    return nothing
end;

save_media();

In [None]:
"""
    save_timestamps()

Compute the range of valid timestamps and write it to `timestamps.csv` in `DST`.

The upper bound is the largest `access_timestamp` found in any
`user_status.*.csv` file of the sources listed in `SOURCES`; the lower bound is
the unix time of 2002-01-01. The output file has a `min_ts,max_ts` header
followed by a single data row.

# Throws
- `ArgumentError` if no user status file is found.
- `AssertionError` if the upper bound precedes the lower bound or lies in the
  future.
"""
function save_timestamps()
    files = reduce(
        vcat,
        [
            Glob.glob("$s/user_media_facts/user_status.*.csv", get_data_path("")) for
            s in SOURCES
        ];
        init = String[],
    )
    # Fail with a clear message instead of an opaque empty-reduction error.
    if isempty(files)
        throw(ArgumentError("no user_status files found under $(get_data_path(""))"))
    end
    max_valid_ts =
        maximum(maximum(parse.(Int64, read_csv(f).access_timestamp)) for f in files)
    min_valid_ts = convert(Int64, Dates.datetime2unix(Dates.DateTime(2002, 1, 1)))
    # `Dates.now()` is local wall-clock time, but `datetime2unix` treats its argument
    # as UTC. Using the local time shifts the bound by the UTC offset and rejects
    # fresh data on machines west of UTC, so take the current time in UTC.
    current_ts = Dates.datetime2unix(Dates.now(Dates.UTC))
    @assert min_valid_ts <= max_valid_ts
    @assert max_valid_ts <= current_ts
    open("$DST/timestamps.csv", "w") do f
        write(f, "min_ts,max_ts\n")
        write(f, "$min_valid_ts,$max_valid_ts\n")
    end
end

save_timestamps();

In [None]:
"""
    get_userid_map()

Build a table mapping `(source, username)` pairs to integer user ids.

Sources are visited in `SOURCES` order and each source's `user_status.*.csv`
files in sorted order. The first time a username is seen within a source it
receives the next id from a counter that starts at 1 and is shared by all
sources. A username seen again within the same source triggers a warning and has
its id overwritten with `0`.

Returns a `DataFrames.DataFrame` with columns `source`, `username` and `userid`.
"""
function get_userid_map()
    next_uid = 1
    user_maps = Dict{String,Dict{String,Int}}()
    for s in SOURCES
        ids = Dict{String,Int}()
        user_maps[s] = ids
        files = Glob.glob("$s/user_media_facts/user_status.*.csv", get_data_path(""))
        for f in sort(files), username in read_csv(f).username
            if haskey(ids, username)
                # Repeated usernames are flagged with the sentinel id 0.
                @warn "duplicate username $username"
                ids[username] = 0
            else
                ids[username] = next_uid
                next_uid += 1
            end
        end
    end
    # Kept as a Vector{Any} of tuples so the DataFrame constructor sees the same input.
    userids = Any[
        (s, username, userid) for (s, ids) in user_maps for (username, userid) in ids
    ]
    return DataFrames.DataFrame(userids, [:source, :username, :userid])
end

CSV.write("$DST/userid_map.csv", get_userid_map());

In [None]:
"""
    archive_training_data()

Move the data directory of every entry in `SOURCES` into the `raw_training_data`
directory, creating that directory if nothing exists at its path. Afterwards, if
nothing exists at the `raw_streaming_data` path, copy the whole archive there.
"""
function archive_training_data()
    path = get_data_path("raw_training_data")
    ispath(path) || mkpath(path)
    for s in SOURCES
        origin = get_data_path(s)
        mv(origin, "$path/$s")
    end
    streaming_path = get_data_path("raw_streaming_data")
    # Seed the streaming copy only once; an existing one is left untouched.
    return ispath(streaming_path) ? nothing : cp(path, streaming_path)
end

archive_training_data();