# Related Series
* Constructs a graph of structurally related series (sequels, prequels, etc.)

In [None]:
# Medium this notebook operates on; it is interpolated into every data path and
# alpha name below.
# NOTE(review): left empty here -- presumably overwritten by whatever executes
# the notebook; confirm.
medium = ""

In [None]:
# Alpha name for this notebook, rooted under the selected medium.
# NOTE(review): not referenced directly by the visible cells -- presumably read
# by helpers included from ../Alpha.ipynb; confirm.
const name = "$medium/all/RelatedSeries";

In [None]:
using DataFrames
using LinearAlgebra
using SparseArrays
import CSV
import DataFrames: DataFrame
import NBInclude: @nbinclude
@nbinclude("../Alpha.ipynb");

## Compute similarity matrices

In [None]:
"""
    get_similarity_matrix(relationships)

Load the related-series edge list for the global `medium` and return it as a square
sparse `Float32` matrix of size `num_items(medium)`.

Only rows whose `relationship` is contained in `relationships` are kept. Every kept
row contributes a weight of `1` at `(source + 1, target + 1)`; repeated pairs are
summed by `sparse`.
"""
function get_similarity_matrix(relationships)
    path = get_data_path("processed_data/$(medium)_$(medium)_related_series.csv")
    edges = filter(row -> row.relationship ∈ relationships, DataFrame(CSV.File(path)))
    n = num_items(medium)
    weights = ones(Float32, length(edges.source))
    # ids are shifted by one before being used as indices
    # NOTE(review): presumably the csv stores 0-based item ids -- confirm
    return sparse(edges.source .+ 1, edges.target .+ 1, weights, n, n)
end;

In [None]:
"""
    all_pairs_shortest_paths(S)

Return a dense `Float32` matrix whose `[i, j]` entry is the number of edges on the
shortest path from `i` to `j` in the graph with adjacency matrix `S`.

Every edge is treated as having weight 1. The diagonal is `0` and unreachable pairs
are `Inf32`.
"""
function all_pairs_shortest_paths(S)
    n = size(S)[1]
    dists = fill(Inf32, size(S)...)
    for v in 1:n
        dists[v, v] = 0
    end

    # after `hops` multiplications, `walks[i, j]` is nonzero exactly when some walk
    # of that many edges leads from i to j
    walks = I(n)
    for hops in 1:n
        walks = walks * S
        rows, cols, _ = findnz(walks)
        improved = false
        for (i, j) in zip(rows, cols)
            if dists[i, j] > hops
                dists[i, j] = hops
                improved = true
            end
        end
        # no pair got closer at this length, so no longer shortest path exists either
        improved || break
    end
    return dists
end;

In [None]:
"""
    get_popularity()

Return the per-item counts that `get_counts` reports for the implicit training data
of the global `medium`, pooled over ratings (`by_item = true`, `per_rating = false`).
"""
get_popularity() =
    get_counts("training", "all", "implicit", medium; by_item = true, per_rating = false);

In [None]:
"""
    get_similarity_matrix_outdir(name, relationships; symmetric = false,
                                 expand = false, sequel_filtering = false)

Return the params directory (`"\$name/similarity_matrix"`) that holds the similarity
matrix for `relationships`, generating and storing the matrix first if that directory
does not exist yet.

# Arguments
- `name`: alpha name the matrix is stored under.
- `relationships`: collection of relationship labels to keep (anything supporting `∈`).

# Keywords
- `symmetric`: make the graph undirected by taking the elementwise max of `S` and `S'`.
- `expand`: replace the graph by its reachability relation; every pair connected by
  a path (including each item with itself) gets weight 1.
- `sequel_filtering`: clear the diagonal and every entry `(i, j)` where item `i` is
  more popular than item `j`.

# Returns
The output directory as a `String`, relative to the alphas data directory.
"""
function get_similarity_matrix_outdir(
    name,
    relationships;
    symmetric = false,
    expand = false,
    sequel_filtering = false,
)
    # if the matrix is already stored on disk, return its filepath
    # otherwise, regenerate the matrix and store it to disk
    outdir = "$name/similarity_matrix"
    # NOTE(review): this relative path is presumably where `write_params` stores
    # its output -- confirm it stays in sync with that helper
    if ispath("../../data/alphas/$outdir")
        return outdir
    end

    @info "generating similarity matrix for relationships $relationships"
    S = get_similarity_matrix(relationships)
    if symmetric
        S = max.(S, S')
    end
    if expand
        S = all_pairs_shortest_paths(S)
        S[isfinite.(S)] .= 1
        S[.!isfinite.(S)] .= 0
    end
    if sequel_filtering
        popularity = get_popularity()
        # Only stored entries can change, so walk those instead of all n^2 pairs.
        S = sparse(S)
        rows = rowvals(S)
        vals = nonzeros(S)
        for j in 1:size(S, 2)
            for k in nzrange(S, j)
                i = rows[k]
                # drop self loops, and only count prequels that are at least as
                # popular as the series
                if i == j || popularity[i] > popularity[j]
                    vals[k] = 0
                end
            end
        end
        # remove the cleared entries from the sparsity pattern so the stored
        # matrix carries no explicit zeros
        dropzeros!(S)
    end
    write_params(Dict("S" => sparse(S)), outdir)
    return outdir
end;

In [None]:
# Relationship labels, grouped by how tightly they tie two entries together.
# Definitions of the labels: https://myanimelist.net/info.php?go=relationinfo
const strict_relations = Set(["sequel", "prequel", "parent_story", "side_story"])
const recap_relations = Set(["alternative_version", "summary", "full_story", "adaptation"])
const loose_relations = Set(["alternative_setting", "spin_off"])
const no_relations = Set(["character", "other"])
const all_relations =
    ∪(strict_relations, recap_relations, loose_relations, no_relations)

# Sanity check: every label present in the data must belong to one of the groups.
df = DataFrame(
    CSV.File(get_data_path("processed_data/$(medium)_$(medium)_related_series.csv")),
);
@assert all(map(x -> x ∈ all_relations, df.relationship))

In [None]:
# Undirected reachability graph over the strict, recap and loose relations.
get_similarity_matrix_outdir(
    "$medium/all/RelatedSeries",
    ∪(strict_relations, recap_relations, loose_relations);
    symmetric = true, expand = true,
);

In [None]:
# Directed graph restricted to the recap relations, stored without post-processing.
get_similarity_matrix_outdir(
    "$medium/all/RecapSeries",
    recap_relations,
);

In [None]:
# Directed prequel / parent-story graph with the popularity-based sequel filter applied.
get_similarity_matrix_outdir(
    "$medium/all/SequelSeries", ["prequel", "parent_story"];
    sequel_filtering = true,
);

## Predict that sequels will get the same ratings as the originals

In [None]:
# Filtered graph written by the SequelSeries cell.
S = read_params("$medium/all/SequelSeries/similarity_matrix")["S"]
watched_items =
    get_split("training", "all", "explicit", medium; fields = [:user, :item, :rating])

# spr[i, u]: S-weighted sum of the ratings user u gave to the items linked to item i
spr =
    S * sparse(
        watched_items.item, watched_items.user, watched_items.rating,
        num_items(medium), num_users(medium),
    )
# spw[i, u]: S-weighted number of those rated items
spw =
    S * sparse(
        watched_items.item, watched_items.user,
        fill(1.0f0, length(watched_items.user)),
        num_items(medium), num_users(medium),
    )


for task in ALL_TASKS
    # predictions of the ExplicitUserItemBiases alpha, gathered over every split
    df = reduce(
        cat,
        [
            read_alpha("$medium/$task/ExplicitUserItemBiases", split, task, "explicit", medium)
            for split in ALL_SPLITS
        ],
    )
    spt = sparse(df.item, df.user, df.rating, num_items(medium), num_users(medium))

    # Residual prediction: average rating of the linked items the user rated,
    # minus the bias alpha's prediction for the same (item, user) pair.
    function model(users, items)
        r = zeros(Float32, length(users))
        @tprogress Threads.@threads for i = 1:length(users)
            item, user = items[i], users[i]
            weight = spw[item, user]
            # pairs without any rated linked item keep the initial value of 0
            if weight != 0
                r[i] = spr[item, user] / weight - spt[item, user]
            end
        end
        r
    end

    write_alpha(
        model,
        medium,
        "$medium/$task/SequelExplicit";
        log = true,
        log_task = task,
        log_content = "explicit",
        log_alphas = ["$medium/$task/ExplicitUserItemBiases"],
    )
end

## Predict that sequels will get watched

In [None]:
# Filtered graph written by the SequelSeries cell.
S = read_params("$medium/all/SequelSeries/similarity_matrix")["S"]
watched_items = get_split(
    "training",
    "all",
    "explicit",
    medium;
    fields = [:user, :item, :rating, :status],
)
# watched[i, u] != 0 iff user u has an entry for item i. Unit weights are used
# (rather than the rating) so that a rating of 0 cannot be mistaken for "unseen".
watched = sparse(
    watched_items.item,
    watched_items.user,
    ones(Float32, length(watched_items.user)),
    num_items(medium),
    num_users(medium),
)
# keep only the entries with status 5
# NOTE(review): presumably status 5 means "completed" -- confirm
watched_items = filter(watched_items, watched_items.status .== 5)
# num_watched_prequels[i, u]: S-weighted number of items linked to i that user u
# has with status 5. The entries must be unit weights, not ratings: the value is
# compared against `num_required_prequels`, which is a count.
num_watched_prequels =
    S * sparse(
        watched_items.item,
        watched_items.user,
        ones(Float32, length(watched_items.user)),
        num_items(medium),
        num_users(medium),
    )
# num_required_prequels[i]: S-weighted number of items linked to i
num_required_prequels = S * ones(Float32, num_items(medium));

# Turn the counts into a 0/1 indicator: item i is predicted for user u when the
# user has every linked item and has no entry for i itself. `v > 0` guards
# against explicitly stored zeros matching an item without linked items.
@showprogress for (i, u, v) in zip(findnz(num_watched_prequels)...)
    num_watched_prequels[i, u] =
        (v > 0 && v == num_required_prequels[i] && watched[i, u] == 0) ? 1 : 0
end
preds = num_watched_prequels
# number of predicted items per user, used to normalise each user's predictions
pred_weights = sum(preds, dims = 1);

for task in ALL_TASKS
    # Spread each user's probability mass evenly over their predicted items;
    # users without any prediction fall back to a uniform distribution.
    function model(users, items)
        r = zeros(Float32, length(users))
        @tprogress Threads.@threads for i = 1:length(users)
            if pred_weights[users[i]] == 0
                r[i] = 1 / num_items(medium)
            else
                r[i] = preds[items[i], users[i]] / pred_weights[users[i]]
            end
        end
        r
    end
    write_alpha(
        model,
        medium,
        "$medium/$task/SequelImplicit";
        log = true,
        log_task = task,
        log_content = "implicit",
    )
end