In [None]:
# TODO split into multiple files

In [None]:
# Name identifying this notebook. It is not referenced in the cells shown here;
# presumably the notebooks included below read it (e.g. for logging) — TODO confirm.
const source_name = "Recommendations";

In [None]:
# Task prefix. NOTE(review): the functions in this notebook take `task` as an argument
# and use it as the leading path component of alpha names ("$task/..."); this global
# looks like a placeholder to be overridden by the caller — TODO confirm.
task = "";

In [None]:
using DataFrames
import CSV
import NBInclude: @nbinclude
import Statistics: mean, quantile, cor
import Random
@nbinclude("../InferenceAlphas/Alpha.ipynb");
@nbinclude("Reranking.ipynb");

In [None]:
# NOTE(review): presumably raises the number of columns/rows DataFrames prints before
# truncating — confirm these variables are honored by the installed version.
ENV["DATAFRAMES_COLUMNS"] = 300;
ENV["DATAFRAMES_ROWS"] = 300;

In [None]:
# When true, `get_rec_df` logs fit diagnostics (RMSE, R2, correlation) and `display`
# keeps the per-model score columns in the rendered table.
DEBUG = true;

In [None]:
# Translate the short `source` identifier into the site name shown to the user.
# `source` and `username` are not defined in this cell; they are expected to already be
# in scope when it runs.
sourcename = if source == "MAL"
    "MyAnimeList"
elseif source == "AniList"
    "AniList"
elseif source == "XML"
    "XML"
else
    # A bare `@assert false` carries no message and assertions may be compiled out,
    # which would leave `sourcename` undefined; fail explicitly instead.
    throw(
        ArgumentError(
            "unsupported source \"$source\"; expected \"MAL\", \"AniList\" or \"XML\"",
        ),
    )
end
@info "Displaying recommendations for $username from $sourcename"

# Merge recommendations with item metadata

In [None]:
# Build an HTML anchor that shows `title` and points at `url`.
# Both values are inserted verbatim; no HTML escaping is applied.
to_hyperlink(title, url) = string("<a href=\"", url, "\">", title, "</a>")

# Turn `title` into an HTML link to the item's page on the preferred site.
#
# Arguments:
#   title  - text to display
#   links  - string holding a bracketed list of quoted URLs, e.g.
#            "['https://myanimelist.net/anime/1', 'https://anilist.co/anime/1']";
#            `missing` is treated as "no links"
#   source - "MAL", "AniList" or "XML"; selects which site's link is preferred
#
# Returns the anchor for the first URL containing the preferred site's search string,
# otherwise the first URL containing "myanimelist", otherwise `title` unchanged.
# Throws `ArgumentError` for an unknown `source`.
function get_hyperlink(title, links; source = "MAL")
    if source == "MAL"
        search = "myanimelist"
    elseif source == "AniList"
        search = "anilist"
    elseif source == "XML"
        # no URL is expected to contain "MAL" literally, so XML normally ends up
        # using the myanimelist fallback below
        search = "MAL"
    else
        throw(
            ArgumentError(
                "unsupported source \"$source\"; expected \"MAL\", \"AniList\" or \"XML\"",
            ),
        )
    end

    ismissing(links) && return title

    # Pull the quoted entries out with a regex instead of `eval`-ing the text: the
    # string comes from a data file, and evaluating it would interpolate `$`, interpret
    # backslashes, or run arbitrary code. Exactly one capture group is set per match.
    urls = [
        something(m.captures...) for
        m in eachmatch(r"'([^']*)'|\"([^\"]*)\"", links)
    ]

    # try the preferred source over every URL first, then default to mal
    for needle in (search, "myanimelist"), url in urls
        if occursin(needle, url)
            return to_hyperlink(title, url)
        end
    end

    return title
end

# Format a duration given in seconds as hours and minutes, e.g. 5400 -> "1h 30m ".
# The value is rounded to whole minutes first. Zero components are left out, every
# emitted component is followed by a space, and a zero duration yields "".
function parse_time(x)
    total_minutes = Int(round(x / 60))
    components = ((fld(total_minutes, 60), "h"), (rem(total_minutes, 60), "m"))
    return join("$(amount)$(unit) " for (amount, unit) in components if amount != 0)
end

# Load the anime metadata table and prepare it for display:
#   * `title` becomes an HTML link chosen according to the global `source`
#   * `long_runner` marks entries whose total runtime exceeds 26 episodes of 30 minutes
#   * `average_episode_duration` is replaced by its formatted text form
function get_anime()
    path = get_data_path("processed_data/anime.csv")
    anime = DataFrame(CSV.File(path; ntasks = 1, stringtype = String))
    anime.title = get_hyperlink.(anime.title, anime.links; source = source)
    # must be computed before the duration column is turned into strings below
    total_runtime = anime.average_episode_duration .* anime.num_episodes
    anime[:, "long_runner"] = total_runtime .> 26 * 30 * 60
    anime.average_episode_duration = parse_time.(anime.average_episode_duration)
    return anime
end;

In [None]:
# Assemble the recommendation table for the current recommendee.
#
# Arguments:
#   task         - leading path component of the alpha names to read ("$task/<model>")
#   suffix       - appended to "MLE.Ensemble" to pick the alpha stored as `score`
#   other_suffix - optional; when given, a second ensemble is stored as `otherscore`
#
# Returns a DataFrame with one row per item that has both metadata and predictions:
# the metadata columns listed in `metadata_cols` come first, followed by every
# remaining column of the prediction table (model outputs, `score`, and the
# `seen` / `ptw` / `related` / `recap` flags).
function get_rec_df(task, suffix, other_suffix = nothing)
    anime = get_anime()
    uid_lookup = DataFrame(CSV.File("../../data/processed_data/anime_to_uid.csv"))
    catalog = innerjoin(uid_lookup, anime, on = "animeid" => "anime_id")

    load_alpha(name) = read_recommendee_alpha("$task/$name", "all").rating
    rating_df = DataFrame(
        "uid" => 0:num_items()-1,
        "explicit" => load_alpha("Explicit"),
        "implicit" => load_alpha("LinearImplicit"),
        "implicit_seq" => load_alpha("Transformer/implicit"),
        "implicit_bag" => load_alpha("NeuralImplicitAutoencoderUntuned"),
        "linear_explicit" => load_alpha("LinearExplicit"),
        "explicit_seq" => load_alpha("Transformer/explicit"),
        "explicit_bag" => load_alpha("NeuralExplicitAutoencoderUntuned"),
        "is_sequel" => read_recommendee_alpha("all/SequelSeries", "all").rating,
    )
    rating_df[:, "score"] = load_alpha("MLE.Ensemble$suffix")
    if other_suffix !== nothing
        rating_df[:, "otherscore"] = load_alpha("MLE.Ensemble$other_suffix")
    end

    if DEBUG
        # compare the explicit predictions with the ratings the user actually gave
        rated = get_recommendee_split("explicit")
        predicted = rating_df.explicit[rated.item]
        rss = sum((rated.rating - predicted) .^ 2)
        tss = sum((rated.rating .- mean(rated.rating)) .^ 2)
        @info "Debug Info"
        @info "RMSE: $(sqrt(rss / length(rated.rating)))"
        @info "R2: $(1 - rss / tss)"
        @info "Correlation: $(cor(rated.rating, predicted))"
    end

    # flag columns start out all-false and are switched on per item below; the
    # display filters use them to drop shows that should not be recommended
    for flag in ("seen", "ptw", "related", "recap")
        rating_df[:, flag] .= false
    end

    # shows the user has already seen, or already plans to watch
    rating_df.seen[get_recommendee_split("implicit").item] .= true
    rating_df.ptw[get_recommendee_split("ptw").item] .= true

    # shows related to, or recaps of, shows they have seen before
    related_mask = read_recommendee_alpha("all/RelatedSeries", "all").rating .!= 0
    rating_df.related[related_mask] .= true
    recap_mask = read_recommendee_alpha("all/RecapSeries", "all").rating .!= 0
    rating_df.recap[recap_mask] .= true

    # center the score on the mean over unseen items
    unseen_scores = rating_df.score[.!rating_df.seen]
    rating_df.score = rating_df.score .- mean(unseen_scores)

    rec_df = innerjoin(catalog, rating_df, on = "uid")
    metadata_cols = [
        "animeid",
        "uid",
        "title",
        "medium",
        "num_episodes",
        "average_episode_duration",
        "long_runner",
        "start_date",
        "genres",
        "tags",
    ]
    # metadata first, then whatever prediction columns are not already listed
    keepcols = union(metadata_cols, names(rating_df))
    return rec_df[:, keepcols]
end;

In [None]:
# Log how many items the recommendee has in each of their lists.
for (content, name) in (
    ("explicit", "rated"),
    ("implicit", "watched"),
    ("ptw", "planned to watch"),
)
    items = get_recommendee_split(content).item
    @info "$(length(items)) items $name"
end

# Display options

In [None]:
# Wrap the row predicate `f` into a pipeline stage `df -> filter(...)`.
# The stage's `fn` keyword post-processes the predicate result (default: unchanged),
# which is how `inv` turns a stage into its negation.
function display_filter(f)
    return function (df; fn = identity)
        return filter(fn ∘ f, df)
    end
end
# Negate a stage built by `display_filter`: forwards all positional arguments and
# passes `fn = !` so rows for which the predicate holds are dropped instead of kept.
function inv(f)
    return (args...) -> f(args...; fn = !)
end
# Ready-made filter stages over the columns of the recommendation table.
# Each keeps the rows for which the predicate holds; wrap in `inv` to drop them instead.
seen = display_filter(row -> row.seen)
related = display_filter(row -> row.related)
recap = display_filter(row -> row.recap)
ptw = display_filter(row -> row.ptw)
tv = display_filter(row -> row.medium ∈ ("tv", "ona"))
sequel = display_filter(row -> 0 < row.is_sequel)
# Keep rows that look like they have actually aired:
#   * with a start date: the date must precede `timestamp_to_date(1)` and the episode
#     count must be either unknown or positive
#   * without a start date: the episode count must be known and positive
# NOTE(review): `timestamp_to_date(1)` is presumably the latest date covered by the
# dataset — confirm against its definition.
released = display_filter(
    row ->
        ismissing(row.start_date) ?
        (!ismissing(row.num_episodes) && row.num_episodes > 0) :
        (
            row.start_date < timestamp_to_date(1) &&
            (ismissing(row.num_episodes) || row.num_episodes > 0)
        ),
)
# Pipeline stage that keeps the first `n` entries of its input.
function head(n)
    return collection -> first(collection, n)
end;
# Pipeline stage that sorts by `field` in descending order and keeps the first `n` rows.
# `field` defaults to `:score`.
function top(n, field = :score)
    return table -> first(sort(table, field, rev = true), n)
end
# Filter stage that keeps rows with at most `n` episodes.
max_episodes(n) = display_filter(row -> n >= row.num_episodes);

In [None]:
# Render a recommendation table as HTML on stdout.
#
# `title` and `score` are moved to the front and the bookkeeping columns are removed.
# Unless `DEBUG` is set, the score and per-model columns are removed as well. Column
# names are converted to title case with a few renamed for readability, and HTML in
# cells (the title links) is rendered rather than escaped.
function display(df::DataFrame)
    bookkeeping = [:title, :score, :recap, :ptw, :seen, :animeid, :uid, :tags]
    shown = select(df, :title, :score, Not(bookkeeping))
    if !DEBUG
        model_columns = [
            :explicit,
            :implicit,
            :score,
            :is_sequel,
            :linear_explicit,
            :implicit_seq,
            :implicit_bag,
            :explicit_seq,
            :explicit_bag,
        ]
        shown = select(shown, Not(model_columns))
    end

    labels = [titlecase(replace(column, "_" => " ")) for column in names(shown)]
    renames = (
        "Medium" => "Type",
        "Explicit" => "Rating",
        "Implicit" => "Watch Probability",
        "Average Episode Duration" => "Episode Length",
        "Num Episodes" => "Episodes",
    )
    labels = replace(labels, renames...)

    Base.show(
        stdout,
        MIME("text/html"),
        shown;
        allow_html_in_cells = true,
        header = labels,
        nosubheader = true,
        row_number_column_title = "Rank",
        top_left_str = "",
    )
end;

# Output recommendation list

In [None]:
# Produce and display the final recommendation list.
#
# Starting from `get_rec_df(task, suffix)`, drops seen shows, recaps and sequels, keeps
# only released entries, takes the `M` highest-scoring rows, lets `rerank` choose `N`
# of them, and renders the result with `display`.
#
# Keywords:
#   M - number of top-scoring candidates handed to `rerank`
#   N - second argument passed to `rerank`
#   similarity_penalty, similarity_weights, long_runner_constraint, related_constraint
#     - forwarded unchanged to `rerank`
function get_recs(
    task,
    suffix;
    M = 500,
    N = 20,
    similarity_penalty = 2.0f0^8,
    similarity_weights = Float32[1, 1, 1],
    long_runner_constraint = 1 / 5,
    related_constraint = 1 / 11,
)
    candidates = get_rec_df(task, suffix)
    stages = (
        inv(seen),
        inv(recap),
        inv(sequel),
        released,
        max_episodes(Inf),
        top(M, :score),
    )
    for stage in stages
        candidates = stage(candidates)
    end
    reranked = rerank(
        candidates,
        N,
        similarity_penalty = similarity_penalty,
        similarity_weights = similarity_weights,
        long_runner_constraint = long_runner_constraint,
        related_constraint = related_constraint,
    )
    return display(reranked)
end