In [None]:
# Name constant for this notebook; it is not referenced by any code visible in this file.
# NOTE(review): presumably read by one of the notebooks included below — confirm.
const source_name = "Recommendations";

In [None]:
import NBInclude: @nbinclude
@nbinclude("../InferenceAlphas/Alpha.ipynb");
using DataFrames
import CSV
import Statistics: mean, quantile, cor
import Random
@nbinclude("Reranking.ipynb");

In [None]:
# Set the DATAFRAMES_COLUMNS / DATAFRAMES_ROWS environment variables to 300.
# NOTE(review): assumes DataFrames reads these as its display width/height limits — confirm.
ENV["DATAFRAMES_COLUMNS"] = 300;
ENV["DATAFRAMES_ROWS"] = 300;

In [None]:
# Map the global `source` code to the provider name shown to the user. Any value
# other than "MAL", "AniList" or "Kitsu" trips the assertion before `sourcename`
# is assigned.
sourcename =
    source == "MAL" ? "MyAnimeList" :
    source == "AniList" ? "AniList" :
    source == "Kitsu" ? "Kitsu" : (@assert false)
@info "Displaying anime and manga recommendations for $username from $sourcename"

# Merge recommendations with item metadata

In [None]:
# Build an HTML anchor element that displays `title` and points at `url`.
# Both values are inserted verbatim (no escaping).
to_hyperlink(title, url) = string("<a href=\"", url, "\">", title, "</a>")

# Wrap `title` in an HTML link picked from `links`.
#
# `links` is a string holding a list of quoted URLs (single or double quotes), for
# example "['https://a/1', 'https://b/2']". The first URL containing the keyword
# associated with `source` wins; if none matches, the first URL containing
# "myanimelist" is used; if that fails as well, `title` is returned unchanged.
#
# "Kitsu" is accepted here because the top-level source check of this notebook
# accepts it too; without a Kitsu link the MyAnimeList fallback applies.
#
# Throws an `ArgumentError` when `source` is not a recognised value.
function get_hyperlink(title, links, source)
    search = if source == "MAL"
        "myanimelist"
    elseif source == "AniList"
        "anilist"
    elseif source == "Kitsu"
        "kitsu"
    elseif source == "XML"
        "MAL"
    else
        throw(ArgumentError("unsupported source: $source"))
    end

    # Extract the quoted entries with a regex rather than evaluating file contents
    # as Julia code; the list is parsed once and reused for the fallback pass.
    urls = [m.captures[1] for m in eachmatch(r"[\"']([^\"']*)[\"']", links)]

    # preferred source first, then default to mal
    for keyword in (search, "myanimelist")
        idx = findfirst(url -> occursin(keyword, url), urls)
        if idx !== nothing
            return to_hyperlink(title, urls[idx])
        end
    end

    return title
end

# Format a duration `x` given in seconds as hours and minutes, e.g. "1h 30m ".
# The value is rounded to whole minutes first. Parts equal to zero are left out, so
# a duration that rounds to zero minutes gives "". Every emitted part is followed
# by a single space.
function parse_time(x)
    total_minutes = round(Int, x / 60)
    parts = (fld(total_minutes, 60) => "h", rem(total_minutes, 60) => "m")
    return join("$(amount)$(unit) " for (amount, unit) in parts if amount != 0)
end

# Load the processed metadata CSV for `medium` into a DataFrame and replace each
# title with an HTML link chosen for the global `source` (see `get_hyperlink`).
function get_media(medium::String)
    path = get_data_path("processed_data/$medium.csv")
    table = CSV.File(path; ntasks = 1, stringtype = String)
    media = DataFrame(table)
    media.title = get_hyperlink.(media.title, media.links, source)
    return media
end;

In [None]:
# Inner-join the uid mapping file of `medium` with its metadata table. Rows are
# matched on the mapping's "<medium>id" column and the metadata's "<medium>_id"
# column. The function is wrapped in `@memoize`.
@memoize function get_media_to_uid(medium)
    metadata = get_media(medium)
    uid_map = DataFrame(CSV.File("../../data/processed_data/$(medium)_to_uid.csv"))
    innerjoin(uid_map, metadata, on = "$(medium)id" => "$(medium)_id")
end;

In [None]:
# Build the recommendation table for `medium`.
#
# One row per item: metadata from `get_media_to_uid`, the recommendee's stored
# `rating` vectors (read for `task`, or for "all" in the case of the series-relation
# features), plus `seen` and `ptw` flags. The returned `uid` column is shifted by +1
# relative to the 0-based uid used for the join. Column order is: id, uid, title,
# medium, length column(s), genres, tags, start_date, (start_season for anime),
# then the remaining rating columns in construction order.
function get_rec_df(task, medium)
    media_to_uid = get_media_to_uid(medium)

    # `rating` vector stored under "<medium>/<task>/<name>" and "<medium>/all/<name>"
    alpha(name) = read_recommendee_alpha("$medium/$task/$name", "all", medium).rating
    alpha_all(name) = read_recommendee_alpha("$medium/all/$name", "all", medium).rating

    rating_df = DataFrame(
        "uid" => 0:num_items(medium)-1,
        "explicit" => alpha("LinearExplicit"),
        "implicit" => alpha("LinearImplicit"),
        "implicit_seq" => alpha("Transformer/v1/implicit"),
        "implicit_bag" => alpha("NeuralImplicitUniversalUntuned"),
        "explicit_seq" => alpha("Transformer/v1/explicit"),
        "explicit_bag" => alpha("NeuralExplicitUniversalUntuned"),
        "explicit_baseline" => alpha("ExplicitUserItemBiases"),
        "num_dependencies" => alpha_all("Dependencies"),
        "is_sequel" => alpha_all("SequelSeries"),
        "is_direct_sequel" => alpha_all("DirectSequelSeries"),
        "is_related" => alpha_all("RelatedSeries"),
        "is_recap" => alpha_all("RecapSeries"),
        "is_cross_recap" => alpha_all("CrossRecapSeries"),
        "is_cross_related" => alpha_all("CrossRelatedSeries"),
        "score1" => alpha("MLE.Ensemble.1"),
        "score2" => alpha("MLE.Ensemble.2"),
        "score3" => alpha("MLE.Ensemble.3"),
        "score4" => alpha("MLE.Ensemble.4"),
        "score5" => alpha("MLE.Ensemble.5"),
        "score6" => alpha("MLE.Ensemble.6"),
    )

    if DEBUG
        @info "Debug Info"
        labels = ["explicit" => "rated", "implicit" => "watched", "ptw" => "planned to watch"]
        for (content, name) in labels
            n = length(get_recommendee_split(content, medium).item)
            @info "$n items $name"
        end
        # fit of the "explicit" column against the ratings the user actually gave
        df = get_recommendee_split("explicit", medium)
        if !isempty(df.item)
            rss = sum((df.rating - rating_df.explicit[df.item]) .^ 2)
            tss = sum((df.rating .- mean(df.rating)) .^ 2)
            @info "RMSE: $(sqrt(rss / length(df.rating)))"
            @info "R2: $(1 - rss / tss)"
            @info "Correlation: $(cor(df.rating, rating_df.explicit[df.item]))"
        end
    end

    # flag items the user has already seen or plans to watch, so later pipeline
    # stages can filter on them
    for (flag, split) in (("seen", "implicit"), ("ptw", "ptw"))
        rating_df[:, flag] .= false
        rating_df[!, flag][get_recommendee_split(split, medium).item] .= true
    end
    # TODO something special for ptw series

    rec_df = innerjoin(media_to_uid, rating_df, on = "uid")
    rec_df[:, "uid"] .+= 1

    if medium == "anime"
        series_length = ["num_episodes"]
    elseif medium == "manga"
        series_length = ["num_volumes", "num_chapters"]
    end
    keepcols = [
        ["$(medium)id", "uid", "title", "medium"]
        series_length
        ["genres", "tags", "start_date"]
    ]
    medium == "anime" && push!(keepcols, "start_season")
    # append every rating column that is not already listed, keeping its order
    append!(keepcols, setdiff(names(rating_df), keepcols))
    return rec_df[:, keepcols]
end;

# Display options

In [None]:
# Pipeline stages for narrowing and ordering a recommendation table. Every stage is
# a one-argument callable, so stages can be chained with `|>`.

# Turn a row predicate `f` into a filter stage. The keyword `fn` post-processes the
# predicate result: `identity` keeps matching rows, `!` keeps the non-matching ones.
display_filter(f) = (df; fn = identity) -> filter(fn ∘ f, df)
# Negate a stage created by `display_filter`.
inv(f) = (x...) -> f(x...; fn = !)
seen = display_filter(x -> x.seen)
ptw = display_filter(x -> x.ptw)
# `get_rec_df` names this column `is_related`; there is no `related` column.
related = display_filter(x -> x.is_related != 0)
crossrecap = display_filter(x -> x.is_cross_recap != 0 && x.is_direct_sequel == 0)
recap = display_filter(x -> x.is_recap != 0)
tv = display_filter(x -> x.medium in ["tv", "ona"])
dependent = display_filter(x -> x.num_dependencies > 0 && x.is_direct_sequel == 0)
# Keep rows with a known start date earlier than `timestamp_to_date(1, medium)` that
# also pass `released_item`.
released(medium) = display_filter(
    x ->
        !ismissing(x.start_date) &&
            (x.start_date < timestamp_to_date(1, medium)) &&
            released_item(medium, x),
)
# First `n` rows in their current order.
head(n) = x -> first(x, n);
# The `n` rows with the largest `field` values (default field: `:score`).
top(n, field) = x -> first(sort(x, field, rev = true), n)
top(n) = top(n, :score)
max_episodes(n) = display_filter(x -> x.num_episodes <= n)
startdate(year, month = 1) =
    display_filter(x -> x.start_date >= Dates.DateTime(year, month))
# Case-insensitive substring match of `title` against column `col`.
search(title, col = :title) = display_filter(x -> occursin(lowercase(title), lowercase(x[col])));

In [4]:
# Copy column `x` of `df` into its `score` column (modifying `df`) and return `df`.
function score(df, x)
    df[:, :score] = df[:, x]
    return df
end
# One-argument form: returns a callable applying `score(df, x)`, for use with `|>`.
score(x) = Base.Fix2(score, x);

In [None]:
# Decide whether item `x` of `medium` counts as released, judged by its length
# field: `num_episodes` for "anime", `num_chapters` for "manga". The item passes
# when that count is missing or positive.
#
# The start date is deliberately not consulted: a rule of the form
# "missing start date and a known positive count" is already covered by the
# positive-count case, so it cannot change the outcome.
#
# Throws an `ArgumentError` for any other `medium`.
function released_item(medium::String, x)
    if medium == "anime"
        n_parts = x.num_episodes
    elseif medium == "manga"
        n_parts = x.num_chapters
    else
        throw(ArgumentError("unsupported medium: $medium"))
    end
    return ismissing(n_parts) || n_parts > 0
end

In [None]:
# Write `df` to stdout as an HTML table with `title` and `score` as the leading
# columns. The columns is_recap, seen, "<medium>id" and tags are dropped. Unless
# DEBUG is set, only title, medium, the length column(s), start_date and genres are
# kept. Header text is derived from the column names (underscores become spaces,
# title case) with a few friendlier replacements.
function display(df::DataFrame, medium::String)
    hidden = [:title, :score, :is_recap, :seen, Symbol("$(medium)id"), :tags] # , :uid
    table = select(df, :title, :score, Not(hidden))
    if !DEBUG
        if medium == "anime"
            series_length = [:num_episodes]
        elseif medium == "manga"
            series_length = [:num_volumes, :num_chapters]
        end
        table = select(table, [[:title, :medium]; series_length; [:start_date, :genres]])
    end

    renames = (
        "Medium" => "Type",
        "Explicit" => "Rating",
        "Implicit" => "Watch Probability",
        "Average Episode Duration" => "Episode Length",
        "Num Episodes" => "Episodes",
        "Num Volumes" => "Volumes",
        "Num Chapters" => "Chapters",
    )
    headers = [titlecase(replace(col, "_" => " ")) for col in names(table)]
    headers = replace(headers, renames...)

    return Base.show(
        stdout,
        MIME("text/html"),
        table;
        allow_html_in_cells = true,
        header = headers,
        nosubheader = true,
        row_number_column_title = "Rank",
        top_left_str = "",
    )
end
# One-argument form: returns a callable applying `display(x, medium)`, for `|>`.
display(medium::String) = Base.Fix2(display, medium);

In [None]:
# Recommendation pipeline variant: build the table, drop seen / recap / dependent /
# cross-recap rows, keep released items, take the top `M` by score, rerank to `N`
# and display.
# NOTE(review): the `recommend(medium; ...)` definition further down this file has
# the same positional signature, so once both cells have run the later definition
# replaces this one.
# NOTE(review): this calls `get_rec_df` with three arguments, while the `get_rec_df`
# defined in this file takes two (task, medium), and no `score` column is set before
# `top(M)` — verify before relying on this variant.
recommend(
    medium;
    task = "temporal_causal",
    suffix = ".2",
    M = 500,
    N = 20,
    similarity_spec = (nothing, 0.1),
) =
    get_rec_df(task, medium, suffix) |>
    inv(seen) |>
    inv(recap) |>
    inv(dependent) |>
    inv(crossrecap) |>
    released(medium) |>
    top(M) |>
    rerank(medium, N, similarity_spec) |>
    display(medium);

In [None]:
# Produce and display recommendations for `medium`.
#
# Steps: build the table for `task`; drop rows that are seen, recaps, dependent on
# other entries, or cross-recaps; keep released items; copy column `relevance` into
# `score`; take the `M` highest-scoring rows; rerank them down to `N` using
# `similarity_spec` and `constraint_spec`; render the result.
function recommend(
    medium;
    task = "temporal_causal",
    relevance = "score3",
    M = 500,
    N = 50,
    similarity_spec = (weights = nothing, penalty = 1),
    constraint_spec = (
        intrarelated = 0,
        interrelated = 1,
        crossrelated = 0.1,
        ptw = 0.1,
        seasonal = 0.25,
    ),
)
    candidates = get_rec_df(task, medium)
    for exclude in (seen, recap, dependent, crossrecap)
        candidates = inv(exclude)(candidates)
    end
    candidates = released(medium)(candidates)
    shortlist = top(M)(score(candidates, relevance))
    ranked = rerank(medium, N, similarity_spec, constraint_spec)(shortlist)
    return display(ranked, medium)
end;