# Related Series
* Constructs graphs of structurally related series (sequels, prequels, etc.)

In [None]:
import NBInclude: @nbinclude
@nbinclude("../TrainingAlpha.ipynb");

In [None]:
# Notebook parameters. They are empty placeholders in this file and are passed
# unchanged to the save_* calls at the bottom of the notebook; presumably they
# are filled in by whatever launches the notebook — TODO confirm.
version = ""
dataset = ""
medium = ""
metric = ""

In [None]:
import SparseArrays;

In [None]:
# Build a binary sparse matrix of relation edges from `source_medium` items
# (rows) to `target_medium` items (columns).
#
# Rows of "processed_data/$source_medium.relations.csv" are kept when their
# `source_media`/`target_media` match the arguments and their `relation` is a
# member of `relations`. The result has size
# num_items(source_medium) x num_items(target_medium) with Float32 entries.
function get_relations(source_medium, target_medium, relations)
    df = read_csv(get_data_path("processed_data/$source_medium.relations.csv"))
    df = filter(
        x ->
            x.source_media == source_medium &&
                x.target_media == target_medium &&
                x.relation ∈ relations,
        df,
    )
    # `sparse` sums duplicate (row, col) pairs by default. The same pair can be
    # listed several times (e.g. under two relation types), so combine with
    # `max` to keep every stored entry at exactly 1.
    SparseArrays.sparse(
        parse.(Int32, df.source_id),
        parse.(Int32, df.target_id),
        fill(1.0f0, length(df.source_id)),
        num_items(source_medium),
        num_items(target_medium),
        max,
    )
end;

In [None]:
# Return the transitive closure of the square adjacency matrix `S`, with the
# same element type as `S` and entries in {0, 1}.
#
# Any nonzero entry of `S` is treated as an edge. Each pass adds all pairs
# connected by a two-step path through the current closure, and the loop stops
# as soon as a pass adds nothing.
function transitive_closure(S)
    # Test against zero instead of `convert.(Bool, S)`, which throws an
    # InexactError for any stored value other than 0 or 1.
    closure = S .!= 0
    @showprogress for _ = 1:first(size(closure))
        new_closure = closure .| ((closure * closure) .> 0)
        if new_closure == closure
            break
        end
        closure = new_closure
    end
    convert.(eltype(S), closure)
end;

In [None]:
# Assemble the within-medium relation graph for `medium`.
#
# Keywords:
#   symmetric  - when true, an edge in either direction produces both entries
#   transitive - when true, replace the graph by its transitive closure
#
# The diagonal is cleared last, so self-loops introduced by either option are
# removed, and explicitly stored zeros are dropped before returning.
function get_matrix(medium, relations; symmetric = false, transitive = false)
    graph = get_relations(medium, medium, relations)
    if symmetric
        graph = max.(graph, graph')
    end
    if transitive
        graph = transitive_closure(graph)
    end
    for node in axes(graph, 1)
        graph[node, node] = 0
    end
    SparseArrays.dropzeros!(graph)
    return graph
end;

In [None]:
# Return one item-id sequence (Vector{Int32}) per user found in the "train"
# split, with each user's items ordered by (updated_at, update_order).
#
# The work is done in two threaded passes:
#   1. rows are split into contiguous batches, and each batch buckets its row
#      indices by `mod(userid, nthreads)`, so all rows of a user land in the
#      same bucket;
#   2. each bucket is sorted by (userid, updated_at, update_order) and cut into
#      per-user histories.
# The order of users in the returned vector follows the buckets, not userid.
# Empty histories are never emitted.
@memoize function get_user_histories(dataset, medium, metric)
    df = as_metric(
        get_split(
            dataset,
            "train",
            medium,
            [:userid, :itemid, :status, :updated_at, :update_order],
        ),
        metric,
    )

    T = Threads.nthreads()
    N = length(df.userid)
    if N == 0
        # `Iterators.partition` rejects a chunk length of zero.
        return Vector{Int32}[]
    end
    batches = collect(Iterators.partition(1:N, cld(N, T)))
    # There can be fewer batches than threads (small N), so size the per-batch
    # results by the number of batches rather than by T.
    users_t = Vector{Vector{Vector{Int32}}}(undef, length(batches))
    Threads.@threads for b in eachindex(batches)
        users = [Int32[] for _ = 1:T]
        for i in batches[b]
            # `mod` keeps the bucket index in 1:T even for negative ids.
            k = mod(df.userid[i], T) + 1
            push!(users[k], i)
        end
        users_t[b] = users
    end
    # Merge bucket k across all batches. Buckets are indexed by 1:T, not by the
    # number of batches, so no user bucket is dropped.
    partitions = [reduce(vcat, [u[k] for u in users_t]) for k = 1:T]

    histories = [Vector{Int32}[] for _ in eachindex(partitions)]
    Threads.@threads for t in eachindex(partitions)
        userid = nothing
        seen = Int32[]
        p = partitions[t]
        order = sortperm(collect(zip(df.userid[p], df.updated_at[p], df.update_order[p])))
        for o in order
            i = p[o]
            if userid === nothing || userid != df.userid[i]
                # A new user starts here; flush the previous user's history.
                # Nothing is flushed before the first user of the bucket.
                isempty(seen) || push!(histories[t], seen)
                userid = df.userid[i]
                seen = Int32[]
            end
            push!(seen, df.itemid[i])
        end
        isempty(seen) || push!(histories[t], seen)
    end
    reduce(vcat, histories)
end;

# Build an inverted index over `histories`: for every item id in
# 1:num_items(medium), the Set of positions in `histories` whose sequence
# contains that item. Items that appear in no history map to an empty Set.
@memoize function index_histories(histories, medium)
    postings = Dict(item => Int64[] for item = 1:num_items(medium))
    @showprogress for h in eachindex(histories)
        for item in histories[h]
            push!(postings[item], h)
        end
    end
    Dict(item => Set(rows) for (item, rows) in postings)
end;

# Among the histories that contain both `a1` and `a2`, compute the fraction in
# which `a2` occurs before `a1`, and return whether that fraction is strictly
# greater than `cutoff`. Returns false when no history contains both items.
function is_watched_after(dataset, medium, metric, cutoff, a1, a2)
    histories = get_user_histories(dataset, medium, metric)
    lookup = index_histories(histories, medium)
    shared = collect(intersect(lookup[a1], lookup[a2]))
    isempty(shared) && return false
    a2_first = fill(false, length(shared))
    Threads.@threads for j in eachindex(shared)
        history = histories[shared[j]]
        # Position of whichever of the two items shows up first.
        hit = findfirst(x -> x == a2 || x == a1, history)
        a2_first[j] = hit !== nothing && history[hit] == a2
    end
    return count(a2_first) / length(shared) > cutoff
end;

In [None]:
# Count how many rows each item id has in the "train" split after it has been
# passed through `as_metric`. Returns the item id => count mapping produced by
# `StatsBase.countmap`.
@memoize function get_popularity(dataset, medium, metric)
    split = get_split(dataset, "train", medium, [:itemid, :status])
    rows = as_metric(split, metric)
    return StatsBase.countmap(rows.itemid)
end

# Return true when `a1` has strictly more rows than `a2` in the popularity
# table. Items missing from the table count as zero.
function is_more_popular(dataset, medium, metric, a1, a2)
    counts = get_popularity(dataset, medium, metric)
    n1 = get(counts, a1, 0)
    n2 = get(counts, a2, 0)
    return n2 < n1
end;

In [None]:
# Map item id (Int32, parsed from the `uid` column) to its start date, read
# from "processed_data/$medium.csv". Items without a start date are omitted
# from the mapping.
@memoize function get_dates(medium)
    df = read_csv(get_data_path("processed_data/$medium.csv"))
    # Concretely typed so lookups and date comparisons are not on Dict{Any,Any}.
    dates = Dict{Int32,Dates.DateTime}()
    for (u, d) in zip(df.uid, df.startdate)
        # Skip blank dates. The `ismissing` guard covers the case where the
        # reader yields `missing` instead of an empty string — unconfirmed.
        if ismissing(d) || isempty(d)
            continue
        end
        dates[parse(Int32, u)] = Dates.DateTime(d, "yyyy-mm-dd")
    end
    dates
end

# Return true when both items have a known start date and the start date of
# `a1` is strictly later than that of `a2`. Returns false if either date is
# unknown.
function is_released_after(medium, a1, a2)
    dates = get_dates(medium)
    if haskey(dates, a1) && haskey(dates, a2)
        return dates[a1] > dates[a2]
    end
    return false
end;

In [None]:
# Compute and save the "dependencies" matrix for `medium`.
#
# M[i, j] = 1 if i should be watched before j. Starting from the symmetric
# sequel/prequel/parent_story/side_story graph, an edge (a1, a2) is kept only
# when a1 is more popular than a2, a1 was not released after a2, and a1 is not
# watched after a2 in more than 60% of the histories containing both.
function save_dependencies(version, dataset, medium, metric)
    relations = Set(["sequel", "prequel", "parent_story", "side_story"])
    M = get_matrix(medium, relations; symmetric = true)
    # `findnz` returns copies, so zeroing entries of M below does not disturb
    # the list of candidate edges being walked.
    rows, cols, _ = SparseArrays.findnz(M)
    @showprogress for (a1, a2) in collect(zip(rows, cols))
        # Same checks in the same short-circuit order, cheapest first.
        rejected =
            !is_more_popular(dataset, medium, metric, a1, a2) ||
            is_released_after(medium, a1, a2) ||
            is_watched_after(dataset, medium, metric, 0.6, a1, a2)
        if rejected
            M[a1, a2] = 0
        end
    end
    SparseArrays.dropzeros!(M)
    outpath = "nondirectional/$version/$dataset/$medium/$metric/dependencies"
    write_params(Dict("S" => M), outpath)
end;

In [None]:
# Compute and save the "related" matrix for `medium`.
#
# M[i, j] = 1 if i and j are in the same franchise, i.e. they are connected
# through any chain of the relations listed below, followed in either
# direction.
function save_related(version, dataset, medium, metric)
    franchise_relations = Set([
        "sequel",
        "prequel",
        "parent_story",
        "side_story",
        "alternative_version",
        "summary",
        "full_story",
        "adaptation",
        "alternative_setting",
    ])
    M = get_matrix(medium, franchise_relations; symmetric = true, transitive = true)
    outpath = "nondirectional/$version/$dataset/$medium/$metric/related"
    write_params(Dict("S" => M), outpath)
end;

In [None]:
# Compute and save the "recaps" matrix for `medium`.
#
# M[i, j] = 1 if i and j are directly linked, in either direction, by one of
# the relations alternative_version, summary, full_story or adaptation.
function save_recaps(version, dataset, medium, metric)
    recap_relations = Set(["alternative_version", "summary", "full_story", "adaptation"])
    M = get_matrix(medium, recap_relations; symmetric = true)
    outpath = "nondirectional/$version/$dataset/$medium/$metric/recaps"
    write_params(Dict("S" => M), outpath)
end;

In [None]:
# Compute and save the cross-medium "adaptations" matrix for `medium`.
#
# Rows index items of the other medium (anime <-> manga) and columns index
# items of `medium`. M[i, j] = 1 when the other medium's relations file lists
# an "adaptation" edge from i to j. Any medium other than "anime" or "manga"
# raises a KeyError.
function save_adaptations(version, dataset, medium, metric)
    other_medium = Dict("anime" => "manga", "manga" => "anime")[medium]
    M = get_relations(other_medium, medium, Set(["adaptation"]))
    outpath = "nondirectional/$version/$dataset/$medium/$metric/adaptations"
    write_params(Dict("S" => M), outpath)
end;

In [None]:
# Write the watch-order dependency matrix for the configured medium.
save_dependencies(version, dataset, medium, metric);

In [None]:
# Write the same-franchise (transitively related) matrix.
save_related(version, dataset, medium, metric);

In [None]:
# Write the recap / alternative-version matrix.
save_recaps(version, dataset, medium, metric);

In [None]:
# Write the cross-medium (anime <-> manga) adaptation matrix.
save_adaptations(version, dataset, medium, metric);