# Match items using media metadata

In [None]:
# Notebook tooling only; presumably switches on automatic formatting of the
# notebook's cells — confirm against the JupyterFormatter documentation.
import JupyterFormatter
JupyterFormatter.enable_autoformat();

In [None]:
import CSV
import DataFrames
import Dates
import JSON
import Memoize: @memoize
import ProgressMeter: @showprogress
import Random
import StringDistances

In [None]:
# Pair of titles for one media entry: the main title and an alternate title.
# `get_media` lowercases both strings before constructing this type.
struct TitleType
    title::String
    alttitle::String
end

# A date together with how much of it was actually specified.
# `resolution` is the number of leading date components (year, month, day, ...)
# that `get_media` managed to parse; 0 marks a missing date, in which case
# `date` only holds the placeholder `Dates.DateTime(1)`.
struct DateType
    date::Dates.DateTime
    resolution::Int64
end

# A count (used for episodes, chapters and volumes) parsed from a raw string.
#   length - the parsed integer (0 when nothing was given)
#   plus   - true when the raw value ended in "+"
#   given  - false when the raw value was empty
struct LengthType
    length::Int64
    plus::Bool
    given::Bool
end

# A duration parsed from a raw string.
#   duration - the parsed number (0 when nothing was given); the unit is not
#              visible here — TODO confirm against the source data
#   given    - false when the raw value was empty
struct DurationType
    duration::Float64
    given::Bool
end

# Column-oriented table of media entries as built by `get_media`.
# Every field is a vector derived from one column of the same CSV file, so
# element `k` of each vector describes the same entry.
struct Media
    uid::Vector{String}
    title::Vector{TitleType}
    summary::Vector{String}
    mediatype::Vector{String}
    startdate::Vector{DateType}
    enddate::Vector{DateType}
    episodes::Vector{LengthType}
    duration::Vector{DurationType}
    chapters::Vector{LengthType}
    volumes::Vector{LengthType}
    status::Vector{String}
    season::Vector{String}
    # one set of lowercased studio names per entry
    studios::Vector{Set{String}}
    genres::Vector{String}
    accessed_at::Vector{String}
end;

# Load the media table for `source` and `medium` from
# "../../../data/media/sources/<source>.<medium>.csv" and parse every column
# into its typed representation. All CSV columns are read as plain strings.
# The function is wrapped in `@memoize`, so the parsed table is meant to be
# reused across calls with the same arguments.
@memoize function get_media(source::String, medium::String)
    # Both title variants are stored lowercased.
    parse_title(main::String, alt::String)::TitleType =
        TitleType(lowercase(main), lowercase(alt))

    # Parse a "-"-separated date. Trailing components are dropped until the
    # remaining prefix is accepted by `Dates.DateTime`; the number of surviving
    # components becomes the resolution. An empty string yields resolution 0.
    function parse_date(raw::String)::DateType
        isempty(raw) && return DateType(Dates.DateTime(1), 0)
        parts = parse.(Int64, split(raw, "-"))
        while !isempty(parts)
            try
                Dates.DateTime(parts...)
                break
            catch
                parts = parts[1:end-1]
            end
        end
        return DateType(Dates.DateTime(parts...), length(parts))
    end

    # Parse a count such as "12" or "12+"; the "+" suffix is recorded in `plus`.
    function parse_episodes(raw::String)::LengthType
        isempty(raw) && return LengthType(0, false, false)
        open_ended = endswith(raw, "+")
        count_text = open_ended ? raw[1:end-1] : raw
        return LengthType(parse(Int64, count_text), open_ended, true)
    end

    # Parse a floating-point duration; an empty string means "not given".
    function parse_duration(raw::String)::DurationType
        isempty(raw) && return DurationType(0, false)
        return DurationType(parse(Float64, raw), true)
    end

    # The studios column holds a JSON list; keep the lowercased names as a set.
    parse_studios(raw::String)::Set{String} =
        Set(lowercase(name) for name in JSON.parse(raw))

    table = CSV.read(
        "../../../data/media/sources/$source.$medium.csv",
        DataFrames.DataFrame;
        ntasks = 1,
        types = String,
        missingstring = nothing,
    )
    return Media(
        table.uid,
        parse_title.(table.title, table.alttitle),
        table.summary,
        table.mediatype,
        parse_date.(table.startdate),
        parse_date.(table.enddate),
        parse_episodes.(table.episodes),
        parse_duration.(table.duration),
        parse_episodes.(table.chapters),
        parse_episodes.(table.volumes),
        table.status,
        table.season,
        parse_studios.(table.studios),
        table.genres,
        table.accessed_at,
    )
end;

In [None]:
# Return the groups of media-type labels that are treated as interchangeable
# for `medium` ("manga" or "anime"), as a tuple of sets. Any other medium
# trips the assertion.
@memoize function get_mediatypes(medium)
    if medium == "manga"
        return (
            Set(["Manhwa", "Manhua", "Manga", "OEL"]),  # manga
            Set(["Light Novel", "Novel"]),  # novels
            Set(["One-shot", "Doujinshi"]),  # short manga
        )
    end
    if medium == "anime"
        return (
            Set(["ONA", "TV"]),  # tv
            Set(["Music", "CM", "PV", "Special"]),  # short anime
            Set(["OVA", "Special", "TV Special"]),  # specials
            Set(["Movie"]),  # movies
        )
    end
    @assert false
end

# Compare two media-type labels.
# Returns 0 when either label is empty. In strict mode (`fuzzy == false`) the
# result is the Bool `t1 == t2`. In fuzzy mode the result is 1 when both labels
# belong to a common group from `get_mediatypes(medium)` and -1 otherwise.
function match_mediatype(medium::String, t1::String, t2::String, fuzzy::Bool)
    (isempty(t1) || isempty(t2)) && return 0
    fuzzy || return t1 == t2
    share_group = any(group -> t1 in group && t2 in group, get_mediatypes(medium))
    return share_group ? 1 : -1
end

# Compare two dates down to the coarser of their resolutions.
# Returns 0 when either date is missing (resolution 0). In fuzzy mode, dates at
# most 31 days apart count as a match right away. Otherwise year, month and day
# are compared in turn for as many components as both dates provide; the first
# difference yields -1 in fuzzy mode and 0 in strict mode. A full agreement
# yields 1.
function match_date(d1::DateType, d2::DateType, fuzzy::Bool)
    (d1.resolution == 0 || d2.resolution == 0) && return 0
    if fuzzy && abs(d1.date - d2.date) <= Dates.Day(31)
        return 1
    end
    accessors = (Dates.year, Dates.month, Dates.day)
    shared = min(d1.resolution, d2.resolution)
    mismatch = fuzzy ? -1 : 0
    for k = 1:shared
        part = accessors[k]
        part(d1.date) == part(d2.date) || return mismatch
    end
    return 1
end

# Compare two season labels.
# Returns 0 when either is empty, 1 when they are identical, and otherwise
# -1 in fuzzy mode or 0 in strict mode.
function match_season(s1::String, s2::String, fuzzy::Bool)
    (isempty(s1) || isempty(s2)) && return 0
    s1 == s2 && return 1
    return fuzzy ? -1 : 0
end

# Return the groups of status labels that are treated as interchangeable:
# first the released-side statuses, then the unreleased ones.
@memoize function get_statustypes()
    return (
        Set(["Finished", "Releasing", "Cancelled", "On Hiatus"]),  # released
        Set(["TBA", "Upcoming"]),  # unreleased
    )
end

# Compare two status labels.
# Returns 0 when either label is empty. In strict mode the result is the Bool
# `s1 == s2`. In fuzzy mode the pair Upcoming/Releasing (in either order) is
# neutral (0); otherwise the result is 1 when both labels fall in a common
# group from `get_statustypes()` and -1 when they do not.
function match_status(s1::String, s2::String, fuzzy::Bool)
    (isempty(s1) || isempty(s2)) && return 0
    fuzzy || return s1 == s2
    # an item can transition from upcoming -> releasing
    in_transition =
        (s1 == "Upcoming" && s2 == "Releasing") || (s1 == "Releasing" && s2 == "Upcoming")
    in_transition && return 0
    share_group = any(group -> s1 in group && s2 in group, get_statustypes())
    return share_group ? 1 : -1
end

# Compare two counts (episodes, chapters or volumes).
# Returns 0 when either count was not given. In strict mode the result is the
# Bool `e1.length == e2.length`. In fuzzy mode counts that differ by at most 1,
# or whose smaller/larger ratio is at least 0.8, score 1; failing that, a "+"
# marker on either side makes the comparison neutral (0), and anything else
# scores -1.
function match_episodes(e1::LengthType, e2::LengthType, fuzzy::Bool)
    (e1.given && e2.given) || return 0
    fuzzy || return e1.length == e2.length
    a, b = e1.length, e2.length
    near = abs(a - b) <= 1 || min(a / b, b / a) >= 0.8
    near && return 1
    return (e1.plus || e2.plus) ? 0 : -1
end

# Compare two durations.
# Returns 0 when either duration was not given. In strict mode the result is
# the Bool `d1 == d2` (comparison of the whole records). In fuzzy mode
# durations that differ by at most 3, or whose smaller/larger ratio is at least
# 0.8, score 1; everything else scores -1.
function match_duration(d1::DurationType, d2::DurationType, fuzzy::Bool)
    (d1.given && d2.given) || return 0
    fuzzy || return d1 == d2
    a, b = d1.duration, d2.duration
    near = abs(a - b) <= 3 || min(a / b, b / a) >= 0.8
    return near ? 1 : -1
end

# Compare two sets of studio names.
# Returns 0 when either set is empty. Otherwise returns a Bool: in fuzzy mode
# whether the sets share at least one name, in strict mode whether they are
# equal.
function match_studios(s1::Set{String}, s2::Set{String}, fuzzy::Bool)
    (isempty(s1) || isempty(s2)) && return 0
    return fuzzy ? !isdisjoint(s1, s2) : s1 == s2
end

# Compare two strings.
# Returns 0 when either string is empty. In strict mode the result is the Bool
# `x == y`. In fuzzy mode the Levenshtein-based similarity from
# `StringDistances.compare` must exceed 0.9 for a match (1); otherwise the
# result is -1.
function matchstring(x::String, y::String, fuzzy::Bool)
    (isempty(x) || isempty(y)) && return 0
    fuzzy || return x == y
    cutoff = 0.9
    similarity = StringDistances.compare(x, y, StringDistances.Levenshtein())
    return similarity > cutoff ? 1 : -1
end

# macro for the following operation
# if expr == errcode
#     return errcode
# else
#     accum += expr
# end
#
# `expr` is evaluated exactly once. Only the three user-supplied arguments are
# escaped into the caller's scope; the temporary that holds the value of `expr`
# stays hygienic, so the macro cannot overwrite a caller variable that happens
# to share its name. The `return` leaves the function in which the macro is
# used.
macro earlyreturn(errcode, accum, expr)
    quote
        result = $(esc(expr))
        if result == $(esc(errcode))
            return $(esc(errcode))
        else
            $(esc(accum)) += result
        end
    end
end

# Compare the titles of two entries.
# Every combination of main/alternate title is tried in order (title-title,
# title-alttitle, alttitle-title, alttitle-alttitle); the first pair for which
# `matchstring` reports a match makes the result 1. When no pair matches, the
# result is -1 in fuzzy mode and 0 in strict mode.
function match_titles(t1::TitleType, t2::TitleType, fuzzy::Bool)
    pairings = (
        (t1.title, t2.title),
        (t1.title, t2.alttitle),
        (t1.alttitle, t2.title),
        (t1.alttitle, t2.alttitle),
    )
    for (left, right) in pairings
        matchstring(left, right, fuzzy) == 1 && return 1
    end
    return fuzzy ? -1 : 0
end

# Run the matcher `fn` twice on `args`: first with the trailing flag set to
# `true` (fuzzy comparison), then with `false` (strict comparison).
# If either call yields -1 the result is -1 immediately; otherwise the two
# results are added together.
function fuzzy(fn::Function, args...)
    total = 0
    # fuzzy pass: a -1 here rejects the pair outright
    @earlyreturn -1 total fn(args..., true)
    # strict pass: adds to the score when the values agree exactly
    @earlyreturn -1 total fn(args..., false)
    return total
end;

In [None]:
# Score how well row `i` of `df1` agrees with row `j` of `df2`.
# Each metadata field is compared through `fuzzy(...)`. As soon as one
# comparison yields -1 the function returns -1 (the `return` is issued inside
# `@earlyreturn`); otherwise the individual results are summed into `n`, which
# is returned. Titles are compared last.
function match_rows(medium, df1, i, df2, j)
    n = 0
    @earlyreturn -1 n fuzzy(match_mediatype, medium, df1.mediatype[i], df2.mediatype[j])
    @earlyreturn -1 n fuzzy(match_date, df1.startdate[i], df2.startdate[j])
    @earlyreturn -1 n fuzzy(match_date, df1.enddate[i], df2.enddate[j])
    @earlyreturn -1 n fuzzy(match_season, df1.season[i], df2.season[j])
    @earlyreturn -1 n fuzzy(match_status, df1.status[i], df2.status[j])
    @earlyreturn -1 n fuzzy(match_episodes, df1.episodes[i], df2.episodes[j])
    # chapters and volumes use the same count representation as episodes
    @earlyreturn -1 n fuzzy(match_episodes, df1.chapters[i], df2.chapters[j])
    @earlyreturn -1 n fuzzy(match_episodes, df1.volumes[i], df2.volumes[j])
    @earlyreturn -1 n fuzzy(match_duration, df1.duration[i], df2.duration[j])
    @earlyreturn -1 n fuzzy(match_studios, df1.studios[i], df2.studios[j])
    @earlyreturn -1 n fuzzy(match_titles, df1.title[i], df2.title[j])
    n
end

# For every row index in `idxs` of the `source1` table, look for the single
# best-scoring row of the `source2` table (scores come from `match_rows`).
# A row is only matched when its best score is positive and unique; a tie for
# the best score leaves it unmatched. Returns a Dict mapping `source1` uids to
# `source2` uids. `showprogress` toggles the progress bar.
function match_metadata(source1, source2, medium, idxs, showprogress)
    left = get_media(source1, medium)
    right = get_media(source2, medium)
    matches = Dict{String,String}()
    @showprogress enabled = showprogress for i in idxs
        best = nothing
        best_score = 0
        for j in eachindex(right.uid)
            score = match_rows(medium, left, i, right, j)
            if score > best_score
                best = j
                best_score = score
            elseif score == best_score
                # tie with the current best score: the match is ambiguous
                best = nothing
            end
        end
        isnothing(best) || (matches[left.uid[i]] = right.uid[best])
    end
    return matches
end

# Match every row of the `source1` table against the `source2` table, splitting
# the (shuffled) row indices into at most `Threads.nthreads()` chunks that are
# processed by separately spawned tasks. Only the first chunk shows a progress
# bar. Returns the merged Dict of `source1` uid => `source2` uid.
function match_metadata(source1, source2, medium)
    nchunks = Threads.nthreads()
    idxs = Random.shuffle(1:length(get_media(source1, medium).uid))
    # Round the chunk size up so there are never more than `nchunks` chunks, and
    # keep it at least 1: `Iterators.partition` rejects a chunk size of 0, which
    # floor division produced whenever there were fewer rows than threads.
    chunksize = max(1, cld(length(idxs), nchunks))
    chunks = Iterators.partition(idxs, chunksize)
    tasks = map(Iterators.enumerate(chunks)) do (i, chunk)
        Threads.@spawn match_metadata(source1, source2, medium, chunk, i == 1)
    end
    matches = fetch.(tasks)
    # `init` keeps the reduction defined when the table is empty (no tasks).
    return reduce(merge, matches; init = Dict{String,String}())
end;

In [None]:
# Match every pair of sources for both media and write one CSV per pair to
# `outdir`. For each pair the later source in `sources` is matched against the
# earlier one; the header row names the two sources and every following row
# holds one matched pair of uids.
outdir = "../../../data/media/metadata"
mkpath(outdir)
sources = ["mal", "anilist", "kitsu", "animeplanet"]
for medium in ["manga", "anime"]
    for (i, target) in enumerate(sources)
        for source in sources[i+1:end]
            matches = match_metadata(source, target, medium)
            open("$outdir/$medium.$(source).$(target).csv", "w") do f
                write(f, "$(source),$(target)\n")
                for (k, v) in matches
                    write(f, "$k,$v\n")
                end
            end
        end
    end
end