In [None]:
# Name of this notebook. NOTE(review): not referenced again in the visible cells;
# presumably read by the helpers pulled in from Alpha.ipynb — confirm.
const source_name = "Recommendations";

In [None]:
using DataFrames
import CSV
import NBInclude: @nbinclude
import Statistics: mean, quantile, cor
@nbinclude("../InferenceAlphas/Alpha.ipynb");

In [None]:
# Set both DataFrames display-limit environment variables to 300.
for limit in ("DATAFRAMES_COLUMNS", "DATAFRAMES_ROWS")
    ENV[limit] = 300
end

In [None]:
# Debug switch: when true, the notebook logs fit diagnostics for the explicit
# ratings and `display` keeps the raw and relative model columns in its table.
DEBUG = false

In [None]:
# Translate the short `source` identifier into the site name shown to the user.
# An unsupported identifier raises a descriptive error rather than a bare
# `@assert false`, which carries no message and may be disabled.
sourcename = get(
    Dict("MAL" => "MyAnimeList", "AniList" => "AniList", "XML" => "XML"),
    source,
    nothing,
)
if isnothing(sourcename)
    throw(ArgumentError("unsupported source: $source"))
end
@info "Displaying recommendations for $username from $sourcename"

# Merge recommendations with item metadata

In [None]:
"""
    to_hyperlink(title, url)

Return an HTML anchor element that displays `title` and points at `url`.
Neither argument is escaped.
"""
function to_hyperlink(title, url)
    return "<a href=\"$url\">$title</a>"
end

"""
    get_hyperlink(title, links; source = "MAL")

Wrap `title` in a hyperlink picked from `links`.

`links` is the textual form of a list of quoted URLs, for example
`"['https://myanimelist.net/anime/1', 'https://anilist.co/anime/1']"`.
The first URL containing the site name for `source` is used (`"MAL"` and `"XML"`
look for `"myanimelist"`, `"AniList"` looks for `"anilist"`). If none matches, the
first URL containing `"myanimelist"` is used. If that fails too, or `links` is
`missing`, the plain `title` is returned.

Throws `ArgumentError` when `source` is not one of the supported identifiers.
"""
function get_hyperlink(title, links; source = "MAL")
    if source == "MAL" || source == "XML"
        # XML entries are matched against myanimelist URLs; the literal "MAL"
        # never occurs in a (lower-case) URL.
        search = "myanimelist"
    elseif source == "AniList"
        search = "anilist"
    else
        throw(ArgumentError("unsupported source: $source"))
    end
    ismissing(links) && return title

    # Pull out the quoted entries directly rather than evaluating the text as
    # Julia code: the text is data, and evaluating it would interpolate or run
    # anything that follows a `\$` inside a URL.
    parsed_links = String[]
    for m in eachmatch(r"'([^']*)'|\"([^\"]*)\"", links)
        push!(parsed_links, something(m.captures[1], m.captures[2]))
    end

    # Try the preferred site first, then default to MyAnimeList.
    for needle in (search, "myanimelist")
        for link in parsed_links
            if occursin(needle, link)
                return to_hyperlink(title, link)
            end
        end
    end

    return title
end

"""
    parse_time(x)

Format a duration of `x` seconds as text such as `"1h 30m "`.

The duration is rounded to whole minutes. A zero hour or minute component is left
out, and every component that is present is followed by a space.
"""
function parse_time(x)
    total_minutes = round(Int, x / 60)
    parts = ((fld(total_minutes, 60), "h"), (rem(total_minutes, 60), "m"))
    return join("$(value)$(suffix) " for (value, suffix) in parts if value != 0)
end

"""
    get_anime()

Load `anime.csv` from the processed data directory into a `DataFrame`, replace each
title with the hyperlink built by `get_hyperlink` for the global `source`, and
replace each average episode duration with the text produced by `parse_time`.
"""
function get_anime()
    path = "../../data/processed_data/anime.csv"
    table = DataFrame(CSV.File(path; ntasks = 1, stringtype = String))
    table.title = get_hyperlink.(table.title, table.links; source = source)
    table.average_episode_duration = parse_time.(table.average_episode_duration)
    return table
end;

In [None]:
# Item metadata, and the same metadata joined onto the rows of anime_to_uid.csv
# (its `animeid` column is matched against the metadata's `anime_id`).
const anime = get_anime()
anime_to_uid = innerjoin(
    DataFrame(CSV.File("../../data/processed_data/anime_to_uid.csv")),
    anime,
    on = "animeid" => "anime_id",
);

In [None]:
# Read the `rating` vector of the named alpha for the recommendee.
function get_alpha(x)
    return read_recommendee_alpha(x, "all").rating
end

# One row per uid (0-based), one column per alpha, followed by the ratio of each
# model output to its baseline.
rating_df = DataFrame("uid" => 0:num_items()-1)
for (column, alpha) in (
    "explicit" => "Explicit",
    "implicit" => "LinearImplicit",
    "baseline_explicit" => "ExplicitUserItemBiases",
    "baseline_implicit" => "NeuralImplicitItemBiases",
    "score" => "MLE.Ensemble",
)
    rating_df[:, column] = get_alpha(alpha)
end
for kind in ("explicit", "implicit")
    rating_df[:, "relative_$kind"] = rating_df[:, kind] ./ rating_df[:, "baseline_$kind"]
end

In [None]:
# Flag the shows the user has already seen and the ones on their plan-to-watch
# list, so the feeds can leave out shows that were seen before.
for (flag, content) in (("seen", "implicit"), ("ptw", "ptw"))
    rating_df[:, flag] .= false
    rating_df[!, flag][get_recommendee_split(content).item] .= true
end

# Flag the shows whose RelatedSeries / RecapSeries alpha is nonzero, so the feeds
# can leave out shows related to ones the user has seen before.
for (flag, alpha) in (("related", "RelatedSeries"), ("recap", "RecapSeries"))
    rating_df[:, flag] .= false
    rating_df[!, flag][read_recommendee_alpha(alpha, "all").rating .!= 0] .= true
end

# Centre the scores on the mean score of the unseen shows.
rating_df.score = rating_df.score .- mean(rating_df.score[.!rating_df.seen]);

In [None]:
# Attach the item metadata to the predictions, then order the columns: the listed
# metadata columns first, followed by every rating_df column not already listed.
rec_df = innerjoin(anime_to_uid, rating_df, on = "uid");
keepcols = union(
    [
        "animeid",
        "uid",
        "title",
        "medium",
        "num_episodes",
        "average_episode_duration",
        "start_date",
        "genres",
        "tags",
    ],
    names(rating_df),
)
rec_df = rec_df[:, keepcols];

# Print debugging information

In [None]:
# When DEBUG is on, log how closely the explicit predictions match the ratings in
# the recommendee's explicit split.
if DEBUG
    df = get_recommendee_split("explicit")
    rss = sum(abs2, df.rating - rating_df.explicit[df.item])
    tss = sum(abs2, df.rating .- mean(df.rating))
    @info "Debug Info"
    @info "RMSE: $(sqrt(rss / length(df.rating)))"
    @info "R2: $(1 - rss / tss)"
    @info "Correlation: $(cor(df.rating, rating_df.explicit[df.item]))"
end

In [None]:
# Log the number of items in each of the recommendee's splits.
for (content, name) in (
    "explicit" => "rated",
    "implicit" => "watched",
    "ptw" => "planned to watch",
)
    n = length(get_recommendee_split(content).item)
    @info "$n items $name"
end

# Create display filters

In [None]:
# Build a filter from the row predicate `f`. The returned function keeps the rows
# for which `fn(f(row))` holds; `fn` defaults to `identity`.
function display_filter(f)
    return (df; fn = identity) -> filter(fn ∘ f, df)
end

# Invert a filter by calling it with `fn = !`. This `inv` takes a filter function
# and shares only its name with Base's `inv`.
function inv(f)
    return (x...) -> f(x...; fn = !)
end

seen = display_filter(row -> row.seen)
related = display_filter(row -> row.related)
recap = display_filter(row -> row.recap)
ptw = display_filter(row -> row.ptw)
tv = display_filter(row -> row.medium in ["tv", "ona"])

# Return a function that keeps the `n` rows with the highest `score`.
function top(n)
    return x -> first(sort(x, :score, rev = true), n)
end;

In [None]:
"""
    is_recent(x)

Return `true` when the date `x` lies strictly between six months ago and today,
and `false` otherwise. A `missing` date is not recent.
"""
function is_recent(x)
    ismissing(x) && return false
    today = Dates.today()
    return today - Dates.Month(6) < x < today
end;
# Filter that keeps the rows whose start_date passes is_recent.
# NOTE(review): the predicate is applied through `filter` in display_filter, so the
# broadcast dot presumably acts on a single value per row — confirm.
recent = display_filter(x -> is_recent.(x.start_date));

In [None]:
"""
    outlier_filter(df::DataFrame, n::Integer, p::Number; fn::Function)

Take the `n + floor(n * p) + 1` highest-scoring rows of `df` and compute the
`p`-quantile of their `relative_implicit` values. A row counts as an inlier when
its `relative_implicit` reaches that quantile (within one machine epsilon), or
when both `relative_implicit` and `relative_explicit` are at least 1.

The rows for which `fn(is_inlier)` holds are kept, and the `n` highest-scoring of
them are returned. Pass `fn = identity` for the inliers and `fn = !` for the
outliers.
"""
function outlier_filter(df::DataFrame, n::Integer, p::Number; fn::Function)
    max_outliers = floor(Int, n * p) + 1
    candidates = copy(top(n + max_outliers)(df))
    cutoff = quantile(candidates.relative_implicit, p)
    tolerance = eps(eltype(cutoff))
    function is_inlier(row)
        return (row.relative_implicit >= cutoff - tolerance) ||
               (row.relative_implicit >= 1 && row.relative_explicit >= 1)
    end
    return top(n)(filter(fn ∘ is_inlier, candidates))
end

# Curried form for use in a pipeline: returns a function of the table. The `fn`
# given here is only the default and can be overridden when that function is called.
function outlier_filter(n, p; fn = identity)
    return (x; fn = fn) -> outlier_filter(x, n, p; fn = fn)
end;

In [None]:
"""
    display(df::DataFrame)

Write `df` to `stdout` as an HTML table with human-readable column headers.

The bookkeeping columns (flags, ids and baselines) are always dropped. Unless
`DEBUG` is set, the raw and relative model columns are dropped as well. HTML in
cells is rendered as markup and the row-number column is titled "Rank".
"""
function display(df::DataFrame)
    hidden = [
        :recap,
        :related,
        :ptw,
        :seen,
        :animeid,
        :uid,
        :baseline_implicit,
        :baseline_explicit,
    ]
    if !DEBUG
        append!(hidden, [:explicit, :implicit, :relative_explicit, :relative_implicit])
    end
    table = select(df, Not(hidden))

    # Headers are the title-cased column names, with a few friendlier overrides.
    renames = Dict(
        "Medium" => "Type",
        "Explicit" => "Rating",
        "Implicit" => "Watch Probability",
        "Relative Explicit" => "Relative Rating",
        "Relative Implicit" => "Relative Watch Probability",
        "Average Episode Duration" => "Episode Length",
        "Num Episodes" => "Episodes",
    )
    headers = map(names(table)) do column
        label = titlecase(replace(column, "_" => " "))
        get(renames, label, label)
    end

    return Base.show(
        stdout,
        MIME("text/html"),
        table;
        allow_html_in_cells = true,
        header = headers,
        nosubheader = true,
        row_number_column_title = "Rank",
        top_left_str = "",
    )
end;

In [None]:
# Top `n` unseen, unrelated shows that pass the outlier filter.
function curated_feed(n)
    candidates = rec_df |> inv(seen) |> inv(related)
    return candidates |> outlier_filter(n, 0.1) |> display
end

# Top `n` unseen, unrelated shows that the outlier filter rejects.
function outlier_feed(n)
    candidates = rec_df |> inv(seen) |> inv(related)
    return candidates |> inv(outlier_filter(n, 0.1)) |> display
end

# Top `n` unseen tv/ona shows flagged as related.
function continue_watching_tv(n)
    candidates = rec_df |> tv |> related |> inv(seen)
    return candidates |> top(n) |> display
end

# Top `n` unseen shows flagged as related whose medium is not tv/ona.
function continue_watching_specials(n)
    candidates = rec_df |> inv(tv) |> related |> inv(seen)
    return candidates |> top(n) |> display
end

# Top `n` shows from the plan-to-watch list.
function plan_to_watch(n)
    return rec_df |> ptw |> top(n) |> display
end;