# Related Series
* Constructs a graph of structurally related series (sequels, prequels, etc.)

In [None]:
# Medium whose related-series data is processed; it is interpolated into the input
# CSV path and the output directory names used below.
# NOTE(review): the empty default is presumably overridden with a real medium name
# before the notebook is executed — confirm.
medium = ""

In [None]:
using DataFrames
using LinearAlgebra
using SparseArrays
import CSV
import DataFrames: DataFrame
import NBInclude: @nbinclude
@nbinclude("../Alpha.ipynb");

## Compute similarity matrices

In [None]:
# Load the related-series edge list for `medium`, keep only the rows whose
# relationship is in `relationships`, and return them as a sparse
# num_items(medium) x num_items(medium) Float32 matrix with one entry per edge.
function get_similarity_matrix(relationships)
    path = get_data_path("processed_data/$(medium)_$(medium)_related_series.csv")
    edges = filter(row -> row.relationship ∈ relationships, DataFrame(CSV.File(path)))
    # source/target are shifted by one to become 1-based matrix indices
    rows = edges.source .+ 1
    cols = edges.target .+ 1
    weights = fill(1.0f0, length(rows))
    n = num_items(medium)
    return sparse(rows, cols, weights, n, n)
end;

In [None]:
# Hop-count distances between every ordered pair of nodes of the graph whose
# adjacency matrix is `S`. Every edge counts as weight 1, so a stored entry (i, j)
# in the k-th power of `S` marks a walk of k edges from i to j; the first k at which
# a pair shows up is its shortest-path length. Pairs that never show up stay Inf32.
function all_pairs_shortest_paths(S)
    n = size(S)[1]
    dists = fill(Inf32, size(S))
    for v in 1:n
        dists[v, v] = 0
    end

    walks = I(n)
    for hops in 1:n
        walks = walks * S
        rows, cols, _ = findnz(walks)
        improved = false
        for (i, j) in zip(rows, cols)
            if dists[i, j] > hops
                dists[i, j] = hops
                improved = true
            end
        end
        # no pair got closer at this length, so longer walks cannot help either
        improved || break
    end
    return dists
end;

In [None]:
# Build the 0/1 similarity matrix for `relationships` and store it under `outdir`.
# If a path for `outdir` already exists nothing is recomputed. `outdir` is returned
# in both cases.
#   symmetric:         also relate the target of every edge back to its source
#   expand:            relate every pair joined by a chain of edges; an item reaches
#                      itself in zero hops, so the diagonal is set as well
#   popularity_filter: clear the diagonal and every entry (i, j) where item i - 1
#                      occurs more often than item j - 1 in the training watch split
function get_similarity_matrix_outdir(
    outdir,
    relationships;
    symmetric = false,
    expand = false,
    popularity_filter = false,
)
    # a previous run already produced this matrix, so reuse it
    ispath("../../data/alphas/$outdir") && return outdir

    @info "generating similarity matrix for relationships $relationships"
    S = get_similarity_matrix(relationships)
    if symmetric
        S = max.(S, S')
    end
    if expand
        S = all_pairs_shortest_paths(S)
        # a finite distance means "reachable"; collapse distances to a 0/1 indicator
        reachable = isfinite.(S)
        S[reachable] .= 1
        S[.!reachable] .= 0
    end
    if popularity_filter
        # popularity = number of rows per itemid in the training watch split
        watched = get_split("training", "watch", medium, [:itemid]).itemid
        popularity = StatsBase.countmap(watched)
        @tprogress Threads.@threads for i = 1:num_items(medium)
            for j = 1:num_items(medium)
                # matrix indices are 1-based while the popularity keys are i - 1 / j - 1
                if i == j || get(popularity, i - 1, 0) > get(popularity, j - 1, 0)
                    S[i, j] = 0
                end
            end
        end
    end
    S[S .!= 0] .= 1
    write_params(Dict("S" => sparse(S)), outdir, true)
    return outdir
end;

In [None]:
# definitions for the relationships can be found at https://myanimelist.net/info.php?go=relationinfo
const strict_relations = Set(("sequel", "prequel", "parent_story", "side_story"))
const recap_relations = Set(("alternative_version", "summary", "full_story", "adaptation"))
const loose_relations = Set(("alternative_setting", "spin_off", "other"))
const no_relations = Set(("character",))
const all_relations =
    union(strict_relations, recap_relations, loose_relations, no_relations)
df = DataFrame(
    CSV.File(get_data_path("processed_data/$(medium)_$(medium)_related_series.csv")),
);
# Every relationship in the data must belong to one of the groups above. This is a
# check on external data, so fail with an explicit error that names the offending
# values rather than relying on `@assert`, which gives no detail and may be disabled.
let unknown = setdiff(unique(df.relationship), all_relations)
    if !isempty(unknown)
        error("unrecognized relationships in related series data: $unknown")
    end
end

In [None]:
# RelatedSeries: uses the strict, recap and loose relations (the `character`
# relation is left out). Edges are made symmetric and then expanded, so two items
# are related whenever a chain of such relations connects them; with `expand` each
# item is also marked as related to itself. Skipped if the output path already exists.
get_similarity_matrix_outdir(
    "$medium/Nondirectional/RelatedSeries",
    union(strict_relations, recap_relations, loose_relations);
    symmetric = true,
    expand = true,
);

In [None]:
# RecapSeries: only the recap relations, made symmetric but not expanded, so only
# directly linked items are related. Skipped if the output path already exists.
get_similarity_matrix_outdir(
    "$medium/Nondirectional/RecapSeries",
    recap_relations;
    symmetric = true,
);

In [None]:
# SequelSeries: follows `prequel` / `parent_story` edges in their stored direction
# (no symmetrization) and expands them through chains. The popularity filter then
# clears the diagonal and every entry whose source item has more training watches
# than its target item. Skipped if the output path already exists.
get_similarity_matrix_outdir(
    "$medium/Nondirectional/SequelSeries",
    ["prequel", "parent_story"];
    expand = true,
    popularity_filter = true,
);

In [None]:
# DirectSequelSeries: the raw `prequel` / `parent_story` edges only, kept directed,
# with no expansion and no popularity filtering. Skipped if the output path already
# exists.
get_similarity_matrix_outdir(
    "$medium/Nondirectional/DirectSequelSeries",
    ["prequel", "parent_story"],
);