In [None]:
import JupyterFormatter
# Switch on JupyterFormatter's autoformat hook for this notebook session.
# NOTE(review): presumably this reformats cell source as cells run — confirm against
# the JupyterFormatter documentation.
JupyterFormatter.enable_autoformat();

In [None]:
import CSV
import DataFrames
import JSON
import Memoize: @memoize
import ProgressMeter: @showprogress

In [None]:
# Type aliases shared by every cell below. They are declared `const` so that the
# return-type and local-variable annotations that mention them resolve to a fixed type
# instead of an untyped global lookup at run time.
const itemtype = Tuple{String,String} # (medium, uid)
const relationtype = String;

In [None]:
# Build the lookup from `source`-specific ids to matched uids for `medium`.
#
# Reads the match table `../../../data/media/match/<medium>.csv`. Each cell of the
# `source` column is parsed as a JSON array of ids, and every id in it is mapped to the
# `uid` of its row. The result is memoized per `(medium, source)`.
#
# Throws an `ErrorException` if the same id is listed more than once, since the
# id -> uid mapping would then be ambiguous.
@memoize function get_media(medium::String, source::String)::Dict{String,String}
    df = CSV.read(
        "../../../data/media/match/$medium.csv",
        DataFrames.DataFrame;
        ntasks = 1,
        types = String,
        missingstring = nothing,
    )
    ret = Dict{String,String}()
    for (items, uid) in zip(df[:, source], df.uid)
        for item in JSON.parse(items)
            # Validate the file contents explicitly: `@assert` may be compiled out, in
            # which case a duplicate id would silently overwrite the earlier mapping.
            if haskey(ret, item)
                error(
                    "duplicate $source id $item in the $medium match table " *
                    "(uids $(ret[item]) and $uid)",
                )
            end
            ret[item] = uid
        end
    end
    ret
end

"""
    get_key(medium, source, id)

Translate the `source`-specific `id` of a `medium` entry into its `(medium, uid)` item
key. Returns `nothing` when the id does not appear in the match table for `medium`.
"""
function get_key(medium::String, source::String, id::String)::Union{itemtype,Nothing}
    uid = get(get_media(medium, source), id, nothing)
    isnothing(uid) ? nothing : (medium, uid)
end;

In [None]:
# For every uid in the `medium` match table, collect which of `sources` list at least
# one id for it.
#
# Reads `../../../data/media/match/<medium>.csv`; a source counts as present for a row
# when the JSON array stored in that source's column is non-empty. The result is
# memoized per `(medium, sources)`.
@memoize function get_media_sources(medium::String, sources)::Dict{String,Set{String}}
    table = CSV.read(
        "../../../data/media/match/$medium.csv",
        DataFrames.DataFrame;
        ntasks = 1,
        types = String,
        missingstring = nothing,
    )
    present = Dict{String,Set{String}}()
    for row in eachrow(table)
        present[row.uid] =
            Set{String}(s for s in sources if !isempty(JSON.parse(row[s])))
    end
    present
end

"""
    get_sources(x, sources)

Return the subset of `sources` that list at least one id for the item
`x = (medium, uid)`, as recorded in the match table of that medium.
"""
function get_sources(x::itemtype, sources)::Set{String}
    medium, uid = x
    by_uid = get_media_sources(medium, sources)
    by_uid[uid]
end;

In [None]:
# Report whether relations `x` and `y` contradict each other, i.e. whether together
# they form one of the opposing pairs listed below (in either order). A relation never
# conflicts with itself.
@memoize function conflicts(x::relationtype, y::relationtype)::Bool
    opposing = (
        ("prequel", "sequel"),
        ("parent_story", "side_story"),
        ("summary", "full_story"),
    )
    any((x, y) == pair || (y, x) == pair for pair in opposing)
end

# Merge two relations recorded for the same pair of items by keeping whichever one
# ranks higher in `relation_priority` (earlier entries win).
#
# Throws an `ArgumentError` when neither `x` nor `y` is a known relation. An explicit
# throw is used because `@assert false` may be compiled out, and it also lets the
# message name the offending relations.
@memoize function compose(x::relationtype, y::relationtype)::relationtype
    relation_priority = (
        "adaptation",
        "prequel",
        "sequel",
        "parent_story",
        "side_story",
        "alternative_version",
        "alternative_setting",
        "summary",
        "full_story",
        "spin_off",
        "character",
        "other",
    )
    for r in relation_priority
        # The first priority entry matching either argument is the winner.
        if r == x || r == y
            return r
        end
    end
    throw(ArgumentError("cannot compose unknown relations \"$x\" and \"$y\""))
end;

In [None]:
# Hand-maintained relation overrides for `source`'s relation data of `medium`.
#
# Keys are `((media, source_id), (media, source_id))` pairs using the source's own ids;
# the value is the relation that `load_relations` uses when the source reports
# conflicting relations for that pair. An unknown `medium` or `source` raises a
# `KeyError`.
@memoize function relation_overrides(
    medium::String,
    source::String,
)::Dict{Tuple{itemtype,itemtype},relationtype}
    # Small helpers that build the (from, to) key for a pair within one medium.
    anime_pair(from, to) = (("anime", from), ("anime", to))
    manga_pair(from, to) = (("manga", from), ("manga", to))

    anime_overrides = Dict(
        "mal" => Dict(
            anime_pair("48220", "59141") => "sequel",
            anime_pair("56129", "56132") => "sequel",
        ),
        "anilist" => Dict(
            anime_pair("20596", "20837") => "sequel",
            anime_pair("20837", "20596") => "prequel",
        ),
        "kitsu" => Dict(
            anime_pair("41153", "41154") => "sequel",
            anime_pair("13436", "12540") => "prequel",
        ),
        "animeplanet" => Dict(),
    )
    manga_overrides = Dict(
        "mal" => Dict(
            manga_pair("1075", "121026") => "sequel",
            manga_pair("84", "18983") => "sequel",
            manga_pair("18983", "84") => "prequel",
        ),
        "anilist" => Dict(
            manga_pair("99035", "67563") => "parent_story",
            manga_pair("87027", "103634") => "sequel",
            manga_pair("30070", "177816") => "sequel",
            manga_pair("33450", "158670") => "sequel",
            manga_pair("158670", "33450") => "prequel",
        ),
        "kitsu" => Dict(),
        "animeplanet" => Dict(),
    )

    by_medium = Dict("anime" => anime_overrides, "manga" => manga_overrides)
    by_medium[medium][source]
end;

In [None]:
"""
    load_relations(medium, source)

Load the relations that `source` reports for `medium` and return them keyed by
`(source_item, target_item)`, where each item is a `(medium, uid)` key.

Rows whose endpoints cannot be mapped to a uid, or whose endpoints map to the same
item, are skipped. Relations between different media are recorded as `"adaptation"`.
When several rows land on the same pair, compatible relations are merged with
`compose`; conflicting ones are resolved through `relation_overrides`, and a warning is
logged (keeping the earlier relation) when no override exists. Overrides that were
never needed are reported as stale.
"""
function load_relations(
    medium::String,
    source::String,
)::Dict{Tuple{itemtype,itemtype},relationtype}
    relations = Dict{Tuple{itemtype,itemtype},relationtype}()
    table = CSV.read(
        "../../../data/media/sources/$source.$(medium)_relations.csv",
        DataFrames.DataFrame;
        ntasks = 1,
        types = String,
        missingstring = nothing,
    )
    overrides = relation_overrides(medium, source)
    seen_overrides = Set{Tuple{itemtype,itemtype}}()
    for row in eachrow(table)
        s = get_key(row.source_media, source, row.source_id)
        t = get_key(row.target_media, source, row.target_id)
        # Any relation that crosses media is treated as an adaptation.
        r = ifelse(row.source_media != row.target_media, "adaptation", row.relation)
        # Drop self-relations and rows with an endpoint that has no matched uid.
        (isnothing(s) || isnothing(t) || s == t) && continue
        k = (s, t)
        # Overrides are keyed by the source's raw ids rather than by uid.
        source_key =
            ((row.source_media, row.source_id), (row.target_media, row.target_id))
        if !haskey(relations, k)
            relations[k] = r
        elseif !conflicts(relations[k], r)
            relations[k] = compose(relations[k], r)
        elseif haskey(overrides, source_key)
            push!(seen_overrides, source_key)
            relations[k] = overrides[source_key]
        else
            @warn "conflicting $source relation $source_key $((relations[k], r))"
        end
    end
    for override in setdiff(keys(overrides), seen_overrides)
        @info "stale $medium $source relation override $override"
    end
    relations
end;

In [None]:
"""
    save_relations(medium)

Merge the relations of all sources for `medium` and write them, sorted, to
`<medium>.relations.csv` inside `../../../data/media/relations` (created if absent).

Sources are processed in priority order. A relation from a later source is kept only
if the pair is not already present and no earlier source lists both of its endpoints.
"""
function save_relations(medium::String)
    sources = ("mal", "anilist", "kitsu", "animeplanet")
    merged = Dict{Tuple{itemtype,itemtype},relationtype}()
    earlier = Set{String}()
    for source in sources
        for (pair, relation) in load_relations(medium, source)
            haskey(merged, pair) && continue
            from_sources = get_sources(pair[1], sources)
            to_sources = get_sources(pair[2], sources)
            # Skip pairs whose two endpoints are both listed by a higher-priority
            # source: only add relations for new items that haven't been seen yet.
            covered = any(seen -> seen in from_sources && seen in to_sources, earlier)
            covered && continue
            merged[pair] = relation
        end
        push!(earlier, source)
    end
    rows = [tuple(Iterators.flatten(pair)..., relation) for (pair, relation) in merged]
    columns = [:source_media, :source_id, :target_media, :target_id, :relation]
    df = DataFrames.DataFrame(rows, columns)
    outdir = "../../../data/media/relations"
    ispath(outdir) || mkpath(outdir)
    CSV.write("$outdir/$medium.relations.csv", sort(df))
end;

In [None]:
# Build the merged relation table for anime and write it to disk.
save_relations("anime");

In [None]:
# Build the merged relation table for manga and write it to disk.
save_relations("manga");