In [None]:
import JupyterFormatter
JupyterFormatter.enable_autoformat();

In [None]:
import CSV
import DataFrames
import Dates
import JSON
import Glob
import ProgressMeter: @showprogress, next!

In [None]:
# Load the CSV at `x` into a DataFrame with every column read as `String` and
# `missingstring = nothing` (no value is turned into `missing`). Extra keyword
# arguments are forwarded to `CSV.read` after the defaults.
read_csv(x; kw...) =
    CSV.read(x, DataFrames.DataFrame; types = String, missingstring = nothing, kw...);

In [None]:
# Resolve `file` inside the project's `data` directory.
#
# Walks upward from the current working directory to the nearest ancestor named
# "notebooks"; the `data` directory is taken to be a sibling of that ancestor.
#
# Throws an error when no ancestor is named "notebooks". Without this guard the
# walk would never terminate, because `dirname` of the filesystem root returns
# the root itself.
function get_data_path(file)
    path = pwd()
    while basename(path) != "notebooks"
        parent = dirname(path)
        if parent == path
            # reached the top of the filesystem without finding the marker directory
            error("could not find a \"notebooks\" directory at or above $(pwd())")
        end
        path = parent
    end
    joinpath(dirname(path), "data", file)
end;

In [None]:
# Names of the data sources handled by this notebook.
const SOURCES = ["mal", "anilist", "kitsu", "animeplanet"]
# Directory the media tables are read from.
const src = get_data_path("media")
# Directory the processed outputs are written to; created if it is not there yet.
const dst = get_data_path("processed_data")
ispath(dst) || mkpath(dst);

In [None]:
# save media
# For each medium, copy the matched table, its relations table and one table per
# source from `src` into `dst`, overwriting any existing copies.
for medium in ("manga", "anime")
    transfers = [
        "match/$medium.csv" => "$medium.csv",
        "relations/$medium.relations.csv" => "$medium.relations.csv",
    ]
    append!(transfers, ["sources/$s.$medium.csv" => "$s.$medium.csv" for s in SOURCES])
    for (from, to) in transfers
        cp("$src/$from", "$dst/$to"; force = true)
    end
end;

In [None]:
# save timestamps
# Convert a raw timestamp string into an `Int`.
# Empty strings and strings starting with "-" yield 0: negative timestamps show up
# when users manually enter a bogus date, and callers treat 0 as "no timestamp".
function parse_timestamp(x)
    invalid = isempty(x) || startswith(x, "-")
    return invalid ? 0 : parse(Int, x)
end

# Scan every user list file of every source and return `(mints, maxts)`: the
# smallest and largest `updated_at` values, ignoring entries that
# `parse_timestamp` maps to 0.
# Returns `(typemax(Int), typemin(Int))` when no usable timestamp is found.
function get_timestamp_range()
    # Integer accumulators: seeding with ±Inf would promote every result to Float64
    # and the values would then be written in floating-point notation.
    mints = typemax(Int)
    maxts = typemin(Int)
    for s in SOURCES
        files = Glob.glob("$s/user_media_facts/user_?????_list.*.csv", get_data_path(""))
        @showprogress for f in files
            for x in read_csv(f).updated_at
                t = parse_timestamp(x)
                t == 0 && continue
                mints = min(mints, t)
                maxts = max(maxts, t)
            end
        end
    end
    mints, maxts
end

mints, maxts = get_timestamp_range()
# Sanity checks. The timestamps are compared against Unix time, so "now" has to be
# taken in UTC: `Dates.now()` is local wall-clock time, which `datetime2unix` would
# interpret as UTC and shift the bound by the local UTC offset.
@assert maxts > 0 && maxts <= Dates.datetime2unix(Dates.now(Dates.UTC)) maxts
@assert mints != typemax(Int) &&
        mints >= Dates.datetime2unix(Dates.DateTime(2002, 1, 1)) mints
@assert mints <= maxts (mints, maxts)
open("$dst/timestamps.csv", "w") do f
    write(f, "min_ts,max_ts\n")
    write(f, "$mints,$maxts\n")
end;

In [None]:
# save userids
#
# Build a table with columns `source`, `username`, `userid`.
#
# Ids are handed out sequentially starting at 1, shared across all sources, in the
# order usernames are encountered (sources in `SOURCES` order, files in sorted
# order). A username that appears more than once within a source is logged with a
# warning and gets userid 0.
function get_userid_map()
    next_uid = 1
    user_maps = Dict{String,Dict{String,Int}}()
    for s in SOURCES
        seen = Dict{String,Int}()
        user_maps[s] = seen
        files = Glob.glob("$s/user_media_facts/user_status.*.csv", get_data_path(""))
        for f in sort(files)
            for username in read_csv(f).username
                if haskey(seen, username)
                    @warn "duplicate username $username"
                    seen[username] = 0
                else
                    seen[username] = next_uid
                    next_uid += 1
                end
            end
        end
    end
    # Typed column vectors (instead of an untyped `[]` of row tuples) give the frame
    # concrete column types and all three columns even when no user was found.
    sources = String[]
    usernames = String[]
    userids = Int[]
    for (s, v) in user_maps
        for (username, userid) in v
            push!(sources, s)
            push!(usernames, username)
            push!(userids, userid)
        end
    end
    DataFrames.DataFrame(source = sources, username = usernames, userid = userids)
end

CSV.write("$dst/userid_map.csv", get_userid_map());

In [None]:
# Move each source's directory out of the data folder into `raw_training_data`,
# creating that folder first when it does not exist yet.
function archive_training_data()
    archive = get_data_path("raw_training_data")
    ispath(archive) || mkpath(archive)
    foreach(SOURCES) do s
        mv(get_data_path(s), "$archive/$s")
    end
end

archive_training_data();