# Match items across sources

In [None]:
import JupyterFormatter
# Switch on JupyterFormatter's autoformatting for this session
# (presumably reformats notebook cells as they run -- see the package docs).
JupyterFormatter.enable_autoformat();

In [None]:
import CSV
import DataFrames
import Glob
import JSON
import Memoize: @memoize
import ProgressMeter: @showprogress
import Random
import SHA
import StatsBase

In [None]:
# An item is identified by a `(source, id)` pair of strings, e.g. `("mal", "123")`.
# Declared `const` so the type annotations that mention `itemtype` inside the functions
# below refer to a fixed, concrete type instead of an untyped global.
const itemtype = Tuple{String,String};

In [None]:
"""
    read_csv(x::String; kw...)

Load the CSV file at path `x` into a `DataFrames.DataFrame`.

Every column is read as `String` (`types = String`) and missing-value detection is
turned off (`missingstring = nothing`). Any extra keyword arguments are forwarded to
`CSV.read` after these defaults.
"""
function read_csv(x::String; kw...)
    sink = DataFrames.DataFrame
    return CSV.read(x, sink; types = String, missingstring = nothing, kw...)
end;

In [None]:
"""
    get_data_path(file)

Return the path of `file` inside the `data` directory that sits three levels above the
directory reported by `@__DIR__`. `file` is interpolated into the path unchanged.
"""
function get_data_path(file)
    relative = "../../../data/$file"
    return joinpath(@__DIR__, relative)
end;

In [None]:
"""
    seed_rng!(salt::String)

Seed the default RNG from the first `seed` value stored in `rng.csv` combined with `salt`.

The seed is the first `UInt64` obtained by reinterpreting the SHA-256 digest of
`init * salt`, so it is deterministic for a given `rng.csv` and `salt`.
"""
function seed_rng!(salt::String)
    rng_table = read_csv(get_data_path("rng.csv"))
    init = first(rng_table.seed)
    digest = SHA.sha256(init * salt)
    seed = first(reinterpret(UInt64, digest))
    return Random.seed!(seed)
end;

In [None]:
# Count how often each item of `source` occurs in the user lists for `medium`.
#
# Every file matching `<source>/user_media_facts/user_<medium>_list.*.csv` is read and the
# values of the source-specific id column are tallied. The result maps `(source, id)` to
# its count, restricted to ids present in the `uid` column of
# `media/sources/<source>.<medium>.csv`; ids from that file that never occur get a count
# of 0. Ids counted at least 100 times but absent from that file are reported via `@info`.
# Results are memoized per `(medium, source)`.
@memoize function get_items(medium::String, source::String)
    itemcols = Dict(
        "mal" => "uid",
        "anilist" => "anilistid",
        "kitsu" => "kitsuid",
        "animeplanet" => "url",
    )
    shards = Glob.glob(get_data_path("/$source/user_media_facts/user_$(medium)_list.*.csv"))
    counts = Dict{Tuple{String,String},Int64}()
    @showprogress for shard in shards
        table = read_csv(shard)
        for (id, n) in StatsBase.countmap(table[:, itemcols[source]])
            item = (source, id)
            counts[item] = get(counts, item, 0) + n
        end
    end

    catalogue = read_csv(get_data_path("media/sources/$source.$medium.csv"); ntasks = 1)
    validids = Set((source, x) for x in catalogue.uid)
    # make sure every catalogued id has an entry, even if nobody listed it
    for item in validids
        get!(counts, item, 0)
    end

    validitems = Dict{Tuple{String,String},Int64}()
    for (k, v) in counts
        if k in validids
            validitems[k] = v
        elseif v >= 100
            @info "$source $medium missing item $k with $v impressions"
        end
    end
    return validitems
end;

In [None]:
"""
    get_edges(matchtype::String, medium::String, source1::String, source2::String)

Read `media/<matchtype>/<medium>.<source1>.<source2>.csv` and return its rows as a set of
edges `((source1, id1), (source2, id2))`.

The ids are taken from the columns named after the two sources. When both sources are
the same, the second id is read from the column named `"<source>_1"`.
"""
function get_edges(matchtype::String, medium::String, source1::String, source2::String)
    table = read_csv(get_data_path("media/$matchtype/$medium.$source1.$source2.csv"))
    leftcol = source1
    rightcol = source1 == source2 ? source2 * "_1" : source2
    edgeset = Set{Tuple{itemtype,itemtype}}()
    for (u, v) in zip(table[:, leftcol], table[:, rightcol])
        push!(edgeset, ((source1, u), (source2, v)))
    end
    return edgeset
end;

In [None]:
"""
    get_edges(matchtypes::Vector{String}, medium::String, source1::String, source2::String)

Combine the edges of several match types into a set with at most one edge per
`source1` item.

The steps are:
1. Edges listed under `"manual/invalid"` are ignored.
2. Match types are read in the given order. The first target seen for an item is kept;
   a later, different target triggers a warning and marks the item as conflicted.
   Conflicted items lose their edge once all match types have been read.
3. Invalid edges that never showed up in any match type are logged as stale.
4. Edges listed under `"manual/valid"` are applied last: they add or overwrite an edge
   (overwrites are warned about) and are logged as stale when already present.

Returns the resulting set of `(u, v)` edges.
"""
function get_edges(
    matchtypes::Vector{String},
    medium::String,
    source1::String,
    source2::String,
)
    edges = Dict{itemtype,itemtype}()
    vertices_with_conflicts = Set{itemtype}()
    invalid_edges = get_edges("manual/invalid", medium, source1, source2)
    seen_invalid_edges = Set{Tuple{itemtype,itemtype}}()
    for matchtype in matchtypes
        for (u, v) in get_edges(matchtype, medium, source1, source2)
            if (u, v) in invalid_edges
                push!(seen_invalid_edges, (u, v))
            elseif !haskey(edges, u)
                edges[u] = v
            elseif edges[u] != v
                @warn "Edge ($u, $(edges[u])) conflicts with edge ($u, $v)"
                push!(vertices_with_conflicts, u)
            end
        end
    end
    # drop conflicted items only after the scan, so every later conflict is still reported
    for x in vertices_with_conflicts
        delete!(edges, x)
    end
    for x in setdiff(invalid_edges, seen_invalid_edges)
        @info "stale invalid edge $x"
    end
    for (u, v) in get_edges("manual/valid", medium, source1, source2)
        if !haskey(edges, u)
            edges[u] = v
        elseif edges[u] == v
            @info "stale valid edge $((u, v))"
        else
            @warn "Overwriting edge $((u, edges[u])) with valid edge $((u, v))"
            edges[u] = v
        end
    end
    return Set((u, v) for (u, v) in edges)
end;

In [None]:
# Greedily group items from `sources` into merged entries for `medium`.
#
# Sources are processed in the given order. For each source, the edges returned by
# `get_edges(matchtypes, ...)` towards every earlier source are loaded, and the source's
# items are visited in descending order of their `get_items` count. An item joins the
# groups its edges point to when those groups can be combined; otherwise it starts a
# group of its own.
#
# Returns `(groups, edges)`: the set of member lists stored in `membership`, and the set
# of edges that were actually used for a merge.
function merge_items(medium::String, sources::Vector{String}, matchtypes::Vector{String})
    edges::Set{Tuple{itemtype,itemtype}} = Set()
    # membership[item] is the list of all items grouped with `item` (itself included)
    membership::Dict{itemtype,Vector{itemtype}} = Dict()
    for i = 1:length(sources)
        source1 = sources[i]
        # one lookup table (source1 item => matched item) per earlier source
        edge_dicts = [
            Dict(k => v for (k, v) in get_edges(matchtypes, medium, source1, source2))
            for source2 in sources[1:i-1]
        ]
        items_sorted_by_popularity =
            sort(collect(get_items(medium, source1)), by = x -> x[2], rev = true)
        @showprogress for (k, _) in items_sorted_by_popularity
            # items of earlier sources that `k` is matched to
            matches = [ed[k] for ed in edge_dicts if k in keys(ed)]
            if length(matches) > 0
                # attempt to match to existing items
                merge_candidates = []
                edge_candidates = []
                for x in matches
                    # NOTE(review): `membership[x]` throws a KeyError when an edge targets
                    # an item that was never registered for its source -- confirm the
                    # edge files only reference items returned by `get_items`.
                    # Each distinct target group is collected once, together with the
                    # edge that first reached it.
                    if membership[x] ∉ merge_candidates
                        push!(merge_candidates, membership[x])
                        push!(edge_candidates, (k, x))
                    end
                end
                # the candidate groups may be combined only if no source occurs in more
                # than one of them (their source sets are pairwise disjoint)
                merge_sources = [Set(s for (s, _) in mc) for mc in merge_candidates]
                can_merge = sum(length.(merge_sources)) == length(union(merge_sources...))
                if can_merge
                    # merge all matching entries
                    v = [x for mc in merge_candidates for x in mc]
                    # order members by the position of their source in `sources`
                    v = sort(v, by = k -> findfirst(x -> x == k[1], sources))
                    push!(v, k)
                    # every member now points at the same combined list
                    for x in v
                        membership[x] = v
                    end
                    for e in edge_candidates
                        push!(edges, e)
                    end
                else
                    @warn "conflicting matches $([(k, m) for m in matches]) | $(merge_candidates)"
                    # leave the item unmerged when its matches disagree
                    membership[k] = [k]
                end
            else
                # no match in any earlier source: the item forms its own group
                membership[k] = [k]
            end
        end
    end
    Set(values(membership)), edges
end;

In [None]:
# Merge groups that are linked by the "manual/overlapping" and "manual/merge" edge files.
#
# `vertices` is a set of groups, each a list of `(source, id)` items. Every source is
# paired with itself and with each source that precedes it in `sources`; for each such
# pair both edge files are read, and an edge whose endpoints belong to two different
# known groups causes those groups to be joined.
#
# Returns `(groups, edges)`: the resulting set of groups and the edges that caused a merge.
function merge_overlapping(
    medium::String,
    sources::Vector{String},
    vertices::Set{Vector{itemtype}},
)
    # membership[item] is the group (member list) that currently contains `item`
    membership::Dict{itemtype,Vector{itemtype}} =
        Dict(item => v for v in vertices for item in v)
    edges::Set{Tuple{itemtype,itemtype}} = Set()
    for i = 1:length(sources)
        source1 = sources[i]
        for source2 in sources[1:i]
            for matchtype in ["manual/overlapping", "manual/merge"]
                for e in get_edges(matchtype, medium, source1, source2)
                    v1 = get(membership, e[1], nothing)
                    v2 = get(membership, e[2], nothing)
                    # an edge with an unknown endpoint, or whose endpoints are already
                    # in equal groups, changes nothing
                    if isnothing(v1) || isnothing(v2) || v1 == v2
                        # only cross-source edges are reported as stale
                        if source1 != source2
                            @info "stale overlapping edge $e"
                        end
                    else
                        # join the two groups and order members by the position of
                        # their source in `sources`
                        v = vcat(v2, v1)
                        v = sort(v, by = k -> findfirst(x -> x == k[1], sources))
                        # every member now points at the joined group
                        for x in v
                            membership[x] = v
                        end
                        push!(edges, e)
                    end
                end
            end
        end
    end
    Set(values(membership)), edges
end;

In [None]:
# Index the media table of `source` for `medium` by uid (memoized).
#
# Reads `media/sources/<source>.<medium>.csv` and returns a `Dict` that maps each value of
# the `uid` column to a one-row `DataFrame` containing that row.
@memoize function get_media(medium::String, source::String)
    table = read_csv(get_data_path("media/sources/$source.$medium.csv"); ntasks = 1)
    nrows = DataFrames.nrow(table)
    return Dict(table[row, "uid"] => table[row:row, :] for row = 1:nrows)
end

"""
    get_entry(medium::String, sources::Vector{String}, items::Vector{itemtype})

Build the one-row `DataFrame` describing a merged group of `items`.

The metadata columns come from the `get_media` row of the first item, with its `uid`
column dropped. The following columns are then added:
- `users_<source>` for each source: the largest `get_items` count among the group's items
  from that source (0 when the group has none);
- `users`: the sum of the `users_<source>` columns;
- one column per source holding a JSON array of the group's ids from that source.
"""
function get_entry(medium::String, sources::Vector{String}, items::Vector{itemtype})
    primary_source, primary_uid = first(items)
    metadata = get_media(medium, primary_source)[primary_uid]
    df = DataFrames.select(metadata, DataFrames.Not([:uid]))
    for s in sources
        df[!, "users_$(s)"] = [0]
    end
    df[!, "users"] = [0]
    for x in sources
        df[!, x] = [[]]
    end
    # the single row of `df`; writes through it update the columns in place
    row = first(df)
    for item in items
        source, uid = item
        count = get_items(medium, source)[item]
        row["users_$(source)"] = max(row["users_$(source)"], count)
        push!(row[source], uid)
    end
    for x in sources
        df[!, x] = JSON.json.(df[:, x])
        df[:, "users"] += df[:, "users_$(x)"]
    end
    return df
end;

In [None]:
"""
    save_matches(medium::String)

Match the items of all sources for `medium` and write the results under `media/match`.

Three CSV files are written:
- `<medium>.csv`: one row per merged group with at least 100 users, in shuffled order,
  with `uid` set to the row number;
- `edges.<medium>.csv`: the edges used for merging, sorted by combined user count;
- `unmerged.<medium>.csv`: the rows of `<medium>.csv` that contain no item from the first
  source, restricted to the other sources' columns and `users`.
"""
function save_matches(medium::String)
    outdir = get_data_path("/media/match")
    ispath(outdir) || mkpath(outdir)
    sources = ["mal", "anilist", "kitsu", "animeplanet"]
    matchtypes = ["malid", "manami", "metadata"]
    groups, merge_edges = merge_items(medium, sources, matchtypes)
    groups, overlapping_edges = merge_overlapping(medium, sources, groups)
    all_edges = union(merge_edges, overlapping_edges)

    # save media
    entries = [get_entry(medium, sources, g) for g in groups]
    media = sort(reduce(vcat, entries), :users, rev = true)
    # filter out rare items
    media = DataFrames.filter(r -> r.users >= 100, media)
    # randomly assign uids
    media = Random.shuffle(media)
    media[!, :uid] = 1:DataFrames.nrow(media)
    CSV.write("$outdir/$medium.csv", media)

    # save edges, each annotated with the summed user counts of its two endpoints
    usercount(item) = get_items(medium, first(item))[item]
    edge_rows =
        [tuple(Iterators.flatten(edge)..., sum(usercount.(edge))) for edge in all_edges]
    edges = DataFrames.DataFrame(edge_rows, [:source1, :uid1, :source2, :uid2, :users])
    edges = sort(edges, :users, rev = true)
    CSV.write("$outdir/edges.$medium.csv", edges)

    # save unmerged items
    lacks_first_source(r) = isempty(JSON.parse(r[sources[1]]))
    unmerged_items =
        DataFrames.select(filter(lacks_first_source, media), [sources[2:end]; ["users"]])
    return CSV.write("$outdir/unmerged.$medium.csv", unmerged_items)
end;

In [None]:
# Seed the RNG before anything random runs; `save_matches` calls `Random.shuffle`
# (presumably this keeps the shuffled uid assignment reproducible).
seed_rng!("ImportDatasets/ImportMedia/Match")

In [None]:
# Build and write the match tables for manga.
save_matches("manga");

In [None]:
# Build and write the match tables for anime.
save_matches("anime");