In [None]:
# Label for this notebook. Nothing in the visible code reads it; presumably the
# Alpha notebook included below uses it (e.g. to tag logs or outputs) — TODO confirm.
const source_name = "Recommendations";

In [None]:
using DataFrames
import CSV
import NBInclude: @nbinclude
import Statistics: mean, quantile, cor
@nbinclude("../InferenceAlphas/Alpha.ipynb");

In [None]:
# Set both DataFrames display environment variables to 300.
# NOTE(review): these look like the limits DataFrames.jl consults when rendering
# tables in a notebook (max columns / max rows) — confirm against its docs.
for setting in ("DATAFRAMES_COLUMNS", "DATAFRAMES_ROWS")
    ENV[setting] = 300
end

In [None]:
# Build an HTML anchor whose visible text is `title` and whose target is `url`.
# Both values are inserted verbatim (no HTML escaping is applied).
to_hyperlink(title, url) = string("<a href=\"", url, "\">", title, "</a>")

# Wrap `title` in a hyperlink picked from `links`.
#
# Arguments
# - `title`:  text to display.
# - `links`:  textual list of quoted URLs, e.g.
#             "['https://myanimelist.net/anime/1', 'https://anilist.co/anime/1']".
#             A `missing` value is treated as "no links".
# - `source`: preferred site, either "MAL" (default) or "AniList".
#
# Returns the anchor for the first URL matching the preferred site; if there is none,
# the first MyAnimeList URL; if there is none either, the bare `title`.
#
# Throws `ArgumentError` when `source` is not one of the supported values.
function get_hyperlink(title, links; source = "MAL")
    if source == "MAL"
        preferred = "myanimelist"
    elseif source == "AniList"
        preferred = "anilist"
    else
        throw(
            ArgumentError(
                "unsupported source $(repr(source)); expected \"MAL\" or \"AniList\"",
            ),
        )
    end

    # Nothing to search through: show the plain title.
    ismissing(links) && return title

    # Extract every quoted substring rather than eval-ing the text as Julia code.
    # Evaluating data is unsafe and fails on URLs containing `$`, `\` or quotes.
    urls = [
        chop(m.match; head = 1, tail = 1) for
        m in eachmatch(r"'[^']*'|\"[^\"]*\"", links)
    ]

    # Preferred site first, then MyAnimeList as the default.
    for site in (preferred, "myanimelist")
        idx = findfirst(url -> occursin(site, url), urls)
        idx === nothing || return to_hyperlink(title, urls[idx])
    end

    return title
end

# Load the processed anime table and replace each title with a hyperlink.
#
# Reads ../../data/processed_data/anime.csv with a single CSV task and `String`
# string columns, then rewrites the `title` column through `get_hyperlink`, pairing
# every title with the `links` value of the same row.
# NOTE(review): `source` is not defined in the visible code (only `source_name` is);
# presumably it is provided by the included Alpha notebook — confirm.
function get_anime()
    path = "../../data/processed_data/anime.csv"
    table = DataFrame(CSV.File(path; ntasks = 1, stringtype = String))
    table.title = get_hyperlink.(table.title, table.links; source = source)
    return table
end;

In [None]:
# `anime` is the hyperlinked metadata table; `anime_to_uid` is the id mapping file
# inner-joined with that metadata (mapping column `animeid` against `anime_id`).
const anime = get_anime()
anime_to_uid = innerjoin(
    DataFrame(CSV.File("../../data/processed_data/anime_to_uid.csv")),
    anime;
    on = "animeid" => "anime_id",
);

In [None]:
# Fetch the `rating` vector of the named alpha for the recommendee ("all" split).
get_alpha(alpha) = read_recommendee_alpha(alpha, "all").rating

# One row per item uid (0 through num_items() - 1) with the model outputs side by side.
# `score` is the negated MLE.Ensemble rating, so an ascending sort on `score` lists the
# highest ensemble values first.
rating_df = DataFrame(
    "uid" => 0:(num_items() - 1),
    "explicit" => get_alpha("Explicit"),
    "implicit" => get_alpha("LinearImplicit"),
    "baseline_explicit" => get_alpha("ExplicitUserItemBiases"),
    "baseline_implicit" => get_alpha("NeuralImplicitItemBiases"),
    "score" => -get_alpha("MLE.Ensemble"),
);

In [None]:
# Evaluate our in-sample explicit predictions against the recommendee's own ratings.
# `df.item` is used to index the explicit prediction vector, `df.rating` holds the
# observed values.
df = get_recommendee_split("explicit")
# residual sum of squares: observed rating minus explicit prediction
rss = sum((df.rating - rating_df.explicit[df.item]) .^ 2)
# total sum of squares around the mean observed rating
tss = sum((df.rating .- mean(df.rating)) .^ 2)
@info "Debug Data"
@info "RMSE: $(sqrt(rss / length(df.rating)))"
@info "R2: $(1 - rss / tss)"
@info "Correlation: $(cor(df.rating, rating_df.explicit[df.item]))"

In [None]:
# Flag columns used to exclude items from recommendations; every flag starts false.
for flag in ("seen", "ptw", "related", "recap")
    rating_df[:, flag] .= false
end

# items already on the user's list (watched, or planned to watch)
rating_df.seen[get_recommendee_split("implicit").item] .= true
rating_df.ptw[get_recommendee_split("ptw").item] .= true

# items with a non-zero RelatedSeries / RecapSeries value, i.e. related to (or recaps
# of) shows the user has seen before
rating_df.related[read_recommendee_alpha("RelatedSeries", "all").rating .!= 0] .= true
rating_df.recap[read_recommendee_alpha("RecapSeries", "all").rating .!= 0] .= true;

In [None]:
# Column order for the recommendation table: the descriptive columns first, then every
# rating_df column that is not already listed (so `uid` is not repeated).
keepcols = union(
    ["animeid", "uid", "title", "medium", "num_episodes", "start_date", "genres", "tags"],
    names(rating_df),
)
# Join metadata with the per-item predictions on `uid` and keep only those columns.
rec_df = innerjoin(anime_to_uid, rating_df; on = "uid")[:, keepcols];

In [None]:
# Log how many items the recommendee has in each split.
for (content, name) in
    ["explicit" => "rated", "implicit" => "watched", "ptw" => "planned to watch"]
    n = length(get_recommendee_split(content).item)
    @info "$n items $name"
end

In [None]:
# function is_recent(x)
#     if ismissing(x)
#         return false
#     end
#     (x > Dates.today() - Dates.Month(6)) && (x < Dates.today())
# end

In [None]:
# Write the `n` rows of `df` with the smallest `score` to stdout as an HTML table.
#
# The flag and id columns (recap, related, ptw, seen, animeid, uid) are dropped before
# printing. Header labels are the remaining column names with underscores replaced by
# spaces and title-cased (e.g. "num_episodes" -> "Num Episodes"). The table is shown
# with HTML allowed in cells, no sub-header, and the row-number column titled "Rank".
function top(df::DataFrame, n::Integer)
    ranked = first(sort(df, :score), n)
    shown = select(ranked, Not([:recap, :related, :ptw, :seen, :animeid, :uid]))
    headers = [titlecase(replace(col, "_" => " ")) for col in names(shown)]
    return Base.show(
        stdout,
        MIME("text/html"),
        shown;
        allow_html_in_cells = true,
        header = headers,
        nosubheader = true,
        row_number_column_title = "Rank",
        top_left_str = "",
    )
end

# Curried form: `top(n)` returns a one-argument function that calls `top(df, n)`,
# convenient at the end of a `|>` pipeline.
function top(n::Integer)
    return df -> top(df, n)
end;

In [None]:
# Row filters over the boolean flag columns. Each takes a table `df` and returns only
# the rows for which the stated condition holds.

# rows flagged as seen
seen(df) = filter(row -> row.seen, df)
# rows not flagged as seen
inv_seen(df) = filter(!(row -> row.seen), df)
# rows not flagged as related
inv_related(df) = filter(!(row -> row.related), df)
# rows not flagged as recap
inv_recap(df) = filter(!(row -> row.recap), df)
# rows flagged as planned-to-watch
ptw(df) = filter(row -> row.ptw, df)
# rows that are unseen but flagged as related
function related(df)
    return filter(df) do row
        !row.seen && row.related
    end
end
# Row filters comparing a model's prediction with its baseline column.
# `explicit` / `implicit` keep rows where the prediction is strictly above the
# baseline; the `inv_` variants keep rows where it is at or below the baseline.
explicit(df) = filter(row -> row.baseline_explicit < row.explicit, df)
implicit(df) = filter(row -> row.baseline_implicit < row.implicit, df)
inv_explicit(df) = filter(row -> row.baseline_explicit >= row.explicit, df)
inv_implicit(df) = filter(row -> row.baseline_implicit >= row.implicit, df)
# rows whose medium is "tv" or "ona"
tv(df) = filter(row -> row.medium ∈ ("tv", "ona"), df)
# rows whose medium is anything else
inv_tv(df) = filter(row -> row.medium ∉ ("tv", "ona"), df)
# rows whose start_date passes `is_recent`
# NOTE(review): `is_recent` appears only as commented-out code in the visible source;
# unless it is defined elsewhere, calling `recent` raises UndefVarError — confirm.
recent(df) = filter(row -> is_recent.(row.start_date), df)