In [None]:
import NBInclude: @nbinclude
@nbinclude("../../TrainingAlphas/Alpha.ipynb");
using DataFrames
import Dates
import CSV
import Statistics: mean
# @nbinclude("Reranking.ipynb")
# @nbinclude("AlsoWatched.ipynb")
# @nbinclude("Explanations.ipynb");

In [None]:
# Set both DataFrames display-limit variables to 300.
# NOTE(review): presumably these are the row/column limits DataFrames reads when
# rendering tables in a notebook - confirm against the DataFrames version in use.
for limit in ("DATAFRAMES_COLUMNS", "DATAFRAMES_ROWS")
    ENV[limit] = 300
end

# Get media

In [None]:
"""
    to_hyperlink(title, url)

Return an HTML anchor element that displays `title` and links to `url`.
"""
function to_hyperlink(title, url)
    return "<a href=\"$url\">$title</a>"
end

"""
    get_hyperlink(title, links, source)

Wrap `title` in a hyperlink chosen from `links`.

`links` is the textual form of a list of quoted URLs, for example
`"['https://myanimelist.net/anime/1', 'https://kitsu.io/anime/1']"`. The first URL
that mentions the site for `source` (`"mal"`, `"anilist"`, `"kitsu"` or
`"animeplanet"`) is used. If there is none, the first MyAnimeList URL is used, and if
that is missing too, `title` is returned unchanged.

The URLs are extracted with a regular expression rather than by evaluating `links` as
Julia code, so text from the data file is never executed and characters with a special
meaning in Julia string literals (dollar signs, backslashes) are kept verbatim.

Fails with an `AssertionError` when `source` is not one of the supported names.
"""
function get_hyperlink(title, links, source)
    site_keywords = Dict(
        "mal" => "myanimelist",
        "anilist" => "anilist",
        "kitsu" => "kitsu",
        "animeplanet" => "anime-planet",
    )
    @assert haskey(site_keywords, source) "unknown source: $source"

    # pull every single- or double-quoted entry out of the list without evaluating it
    quoted_entry = r"'([^']*)'|\"([^\"]*)\""
    urls = String[something(m[1], m[2]) for m in eachmatch(quoted_entry, links)]

    # try the preferred source first, then default to mal
    for keyword in (site_keywords[source], "myanimelist")
        hit = findfirst(url -> occursin(keyword, url), urls)
        isnothing(hit) || return to_hyperlink(title, urls[hit])
    end

    return title
end

"""
    parse_time(x)

Format a duration of `x` seconds as hours and minutes, e.g. `3900` gives `"1h 5m "`.

The duration is first rounded to whole minutes. A component equal to zero is left
out, so a duration that rounds to zero minutes gives an empty string. Every component
that is present is followed by a single space.
"""
function parse_time(x)
    total_minutes = round(Int, x / 60)
    hours = fld(total_minutes, 60)
    minutes = total_minutes % 60
    component(value, suffix) = value == 0 ? "" : "$(value)$(suffix) "
    return component(hours, "h") * component(minutes, "m")
end

"""
    get_media(medium::String, source::String)

Load the `processed_data` CSV for `medium` into a `DataFrame` and replace every title
with the hyperlink that `get_hyperlink` builds for `source`.

Fails with an `AssertionError` if the `status` column contains a value outside the
statuses listed for `medium` (`"anime"` or `"manga"`).
"""
function get_media(medium::String, source::String)
    valid_statuses = Dict(
        "anime" => ["Currently Airing", "Finished Airing", "Not yet aired"],
        "manga" => [
            "Not yet published",
            "Publishing",
            "Discontinued",
            "Finished",
            "On Hiatus",
        ],
    )
    csv_path = get_data_path("processed_data/$medium.csv")
    df = DataFrame(CSV.File(csv_path; ntasks = 1, stringtype = String))
    df.title = get_hyperlink.(df.title, df.links, source)
    # validate fields
    @assert issubset(Set(df.status), Set(valid_statuses[medium]))
    return df
end

"""
    prune_media_df(df, medium)

Return `df` restricted to the id, title, type, series-length, status, date, genre and
tag columns. The series-length columns are `num_episodes` for `"anime"` and
`num_volumes` / `num_chapters` for `"manga"`.

Fails with an `AssertionError` for any other `medium`.
"""
function prune_media_df(df, medium)
    if medium == "anime"
        series_length = ["num_episodes"]
    elseif medium == "manga"
        series_length = ["num_volumes", "num_chapters"]
    else
        # fail here with a clear message instead of an UndefVarError further down
        @assert false "unsupported medium: $medium"
    end
    keepcols = vcat(
        ["mediaid", "uid", "title", "type"],
        series_length,
        ["status", "start_date", "end_date", "genres", "tags"],
    )
    return df[:, keepcols]
end

# Build the media table for `medium`: inner-join the `processed_data` uid mapping with
# the metadata returned by `get_media` (mapping column `mediaid` against the metadata
# column named after the medium plus `_id`), then keep the columns selected by
# `prune_media_df`.
# NOTE(review): `@memoize` is defined outside this file - presumably it caches the
# result per (medium, source) argument pair; confirm.
@memoize function get_media_df(medium, source)
    metadata = get_media(medium, source)
    uid_map = DataFrame(CSV.File(get_data_path("processed_data/$(medium)_to_uid.csv")))
    joined = innerjoin(uid_map, metadata, on = "mediaid" => "$(medium)_id")
    prune_media_df(joined, medium)
end;

In [None]:
# function get_link(medium, source, uid)
#     if isnothing(uid)
#         return nothing
#     end
#     media = get_media_to_uid(medium, source)
#     if uid in media.uid
#         return media.title[findfirst(x -> x == uid, media.uid)]
#     elseif source != "MAL"
#         return get_link(medium, "MAL", uid)
#     else
#         @debug "link not found for $medium $source $uid"
#         return nothing
#     end
# end;

# Get rankings

In [None]:
"""
    get_rating_df(medium)

Assemble a `DataFrame` with one row per item of `medium`.

Columns: `uid` (item ids counted from 0), one column for each `rec_inference` alpha
listed in the function body, a combined `score`, and two flags derived from the
`rec_training` split, `seen` and `ptw`. Items in the plan-to-watch split end up with
`ptw = true` and `seen = false`.
"""
function get_rating_df(medium)
    # read the `alpha` column of the named `rec_inference` split
    load_alpha(name) = get_raw_split("rec_inference", medium, [:itemid], name).alpha
    alpha_sources = [
        "ranking" => "$medium/Ranking",
        "rating" => "$medium/Linear/rating",
        "watch" => "$medium/Linear/watch",
        "plantowatch" => "$medium/Linear/plantowatch",
        "drop" => "$medium/Linear/drop",
        "num_dependencies" => "$medium/Nondirectional/Dependencies",
        "is_sequel" => "$medium/Nondirectional/SequelSeries",
        "is_direct_sequel" => "$medium/Nondirectional/DirectSequelSeries",
        "is_related" => "$medium/Nondirectional/RelatedSeries",
        "is_recap" => "$medium/Nondirectional/RecapSeries",
        "is_cross_recap" => "$medium/Nondirectional/CrossRecapSeries",
        "is_cross_related" => "$medium/Nondirectional/CrossRelatedSeries",
    ]
    rating_df = DataFrame(
        "uid" => 0:num_items(medium)-1,
        (column => load_alpha(name) for (column, name) in alpha_sources)...,
    )

    # score = rating + log10(watch) + 0.1 * log10(plantowatch) - 10 * max(drop, 0.01)
    log10_of(values) = log.(values) ./ log(10)
    rating_df[:, "score"] .= (
        rating_df.rating +
        log10_of(rating_df.watch) +
        0.1 * log10_of(rating_df.plantowatch) +
        (-max.(rating_df.drop, 0.01) * 10)
    )

    # item ids start at 0 while rows start at 1, hence the `.+ 1`
    seen_rows = get_raw_split("rec_training", medium, [:itemid], nothing).itemid .+ 1
    ptw_rows = get_split("rec_training", "plantowatch", medium, [:itemid], nothing).itemid .+ 1
    rating_df[:, "seen"] .= false
    rating_df[:, "ptw"] .= false
    rating_df.seen[seen_rows] .= true
    rating_df.ptw[ptw_rows] .= true
    # plan-to-watch items are not treated as seen
    rating_df.seen[ptw_rows] .= false
    return rating_df
end

"""
    get_ranking_df(medium, source)

Inner-join the media metadata from `get_media_df` with the per-item values from
`get_rating_df` on the `uid` column.
"""
function get_ranking_df(medium, source)
    scores = get_rating_df(medium)
    metadata = get_media_df(medium, source)
    return innerjoin(metadata, scores, on = "uid")
end;

# Display options

In [None]:
"""
    display_filter(f)

Turn the row predicate `f` into a table filter.

The returned function takes a table `df` and keeps the rows for which `fn(f(row))` is
true. The keyword `fn` post-processes the predicate's result and defaults to
`identity`.
"""
function display_filter(f)
    return function (df; fn = identity)
        return filter(row -> fn(f(row)), df)
    end
end

"""
    inv(f)

Negate a filter built by `display_filter`: the result calls `f` with `fn = !`, so the
rows rejected by the original predicate are the ones that are kept.
"""
inv(f) = (args...) -> f(args...; fn = !);

In [None]:
# filter by related series
# Each value is a table filter built with `display_filter`; wrap one in `inv` to drop
# the matching rows instead of keeping them.
seen = display_filter(x -> x.seen) # item has been seen
# the column is named `is_related` (see `get_rating_df`); there is no `related` column
related = display_filter(x -> x.is_related != 0) # item is related to a seen item
cross_related = display_filter(x -> x.is_cross_related != 0) # item is related to a seen item in a different media
recap = display_filter(x -> x.is_recap != 0 && x.is_direct_sequel == 0) # item is a recap of a seen item (and not a direct sequel)
cross_recap = display_filter(x -> x.is_cross_recap != 0 && x.is_direct_sequel == 0) # item is a recap of a seen item in a different media (and not a direct sequel)
dependent = display_filter(x -> x.num_dependencies > 0 && x.is_direct_sequel == 0); # item is a sequel that we haven't seen the prequel for

In [None]:
# filter by date
"""
    parse_date(x)

Convert the date string `x` into a Unix timestamp in whole seconds, or return
`nothing` when `x` is `"Not available"`.

The layout is chosen from the number of space-separated fields: three fields are read
as `u d, Y` (e.g. `"Jan 2, 1970"`), two as `u Y` (e.g. `"Jan 1970"`) and one as `Y`
(e.g. `"1970"`). More than three fields fails with an `AssertionError` whose message
is `x`.
"""
function parse_date(x)
    x == "Not available" && return nothing
    # formats indexed by the number of space-separated fields in `x`
    formats = ("Y", "u Y", "u d, Y")
    nfields = length(split(x, " "))
    @assert nfields <= length(formats) x
    timestamp = Dates.datetime2unix(Dates.DateTime(x, formats[nfields]))
    return Int(timestamp)
end

"""
    released_after(x, timestamp)

Return `true` when the `start_date` of row `x` parses to a time strictly later than
the Unix `timestamp`. A row whose start date `parse_date` maps to `nothing` gives
`false`.
"""
function released_after(x, timestamp)
    release_date = parse_date(x.start_date)
    return !isnothing(release_date) && timestamp < release_date
end

"""
    released_before(x, timestamp)

Return `true` when the `start_date` of row `x` parses to a time strictly earlier than
the Unix `timestamp`. A row whose start date `parse_date` maps to `nothing` gives
`false`.
"""
function released_before(x, timestamp)
    release_date = parse_date(x.start_date)
    return !isnothing(release_date) && release_date < timestamp
end

"""
    after(year, month = 1, date = 1)

Build a table filter that matches rows whose `start_date` is strictly later than
midnight on the given calendar date. Rows for which `released_after` cannot determine
a start date do not match.
"""
function after(year, month = 1, date = 1)
    # convert the cutoff once when the filter is built, not once per row
    cutoff = Dates.datetime2unix(Dates.DateTime(year, month, date))
    return display_filter(x -> released_after(x, cutoff))
end
"""
    before(year, month = 1, date = 1)

Build a table filter that matches rows whose `start_date` is strictly earlier than
midnight on the given calendar date. Rows for which `released_before` cannot determine
a start date do not match.
"""
function before(year, month = 1, date = 1)
    # convert the cutoff once when the filter is built, not once per row
    cutoff = Dates.datetime2unix(Dates.DateTime(year, month, date))
    return display_filter(x -> released_before(x, cutoff))
end
"""
    status(s)

Build a table filter that matches rows whose `status` equals `s`.
"""
function status(s)
    return display_filter(row -> row.status == s)
end
"""
    released(medium)

Build a table filter that drops items which have not come out yet: status
`"Not yet aired"` for `"anime"` and `"Not yet published"` for `"manga"`. Any other
`medium` fails with an `AssertionError`.
"""
function released(medium)
    medium == "anime" && return inv(status("Not yet aired"))
    medium == "manga" && return inv(status("Not yet published"))
    @assert false
end;

In [None]:
# filter by content
"""
    max_episodes(n)

Build a table filter that matches rows with `num_episodes` no greater than `n`.
"""
function max_episodes(n)
    return display_filter(row -> row.num_episodes <= n)
end
"""
    search(key, col)
    search(key::String)

Build a table filter on column `col`. The kind of match depends on `key`:
- a `String` matches case-insensitively as a substring of the cell;
- a `Vector` matches when the cell is one of its elements;
- anything else matches when the cell equals `key`.

The one-argument form is only defined for a `String` key and searches `:title`.
"""
function search(key::String, col)
    return display_filter(row -> occursin(lowercase(key), lowercase(row[col])))
end
function search(key, col)
    return display_filter(row -> row[col] == key)
end
function search(key::Vector, col)
    return display_filter(row -> row[col] in key)
end
search(key::String) = search(key, :title);

In [None]:
# filter by score
"""
    head(n)

Build a function that returns the first `n` rows (or elements) of its argument.
"""
function head(n)
    return rows -> first(rows, n)
end;

"""
    top(n, field)
    top(n)

Build a function that sorts its argument by `field` in descending order and returns
the first `n` rows. `field` defaults to `:score`.
"""
function top(n, field)
    return function (rows)
        ranked = sort(rows, field, rev = true)
        return first(ranked, n)
    end
end
top(n) = top(n, :score);

In [None]:
"""
    display(df::DataFrame, debug::Bool)

Print `df` to `stdout` as an HTML table.

With `debug = true` every column except `tags` is printed. Otherwise only those of a
fixed list of columns (title, type, episode/volume/chapter counts, dates, genres) that
exist in `df` are printed. Header labels are the column names in title case with
underscores replaced by spaces, and four of them are renamed (for instance "Title"
becomes "Recommendation").
"""
function display(df::DataFrame, debug::Bool)
    # select columns to display
    if debug
        shown = select(df, Not([:tags]))
    else
        wanted = [
            "title",
            "type",
            "num_episodes",
            "num_volumes",
            "num_chapters",
            "start_date",
            "end_date",
            "genres",
        ]
        present = filter(in(names(df)), wanted)
        shown = select(df, Symbol.(present))
    end
    # build the header labels
    relabel = (
        "Title" => "Recommendation",
        "Num Episodes" => "Episodes",
        "Num Volumes" => "Volumes",
        "Num Chapters" => "Chapters",
    )
    headers = replace(titlecase.(replace.(names(shown), "_" => " ")), relabel...)
    # print as html
    return Base.show(
        stdout,
        MIME("text/html"),
        shown;
        allow_html_in_cells = true,
        header = headers,
        show_row_number = false,
        row_number_column_title = "Rank",
        top_left_str = "",
    )
end

# Curried form for use at the end of a `|>` pipeline.
function display(debug::Bool)
    return df -> display(df, debug)
end;

# Recommendations

In [None]:
"""
    recommend(medium; M = 1000, N = 100, debug = false)

Print recommendations for `medium` as an HTML table.

Starting from `get_ranking_df`, items are removed when they are unreleased, already
seen, recaps, cross-media recaps, or dependent on something not yet seen. Of the
remaining items, the `M` with the highest `watch` value are kept, and the `N` best of
those by `score` are displayed.

# Keywords
- `M`: size of the candidate pool selected by `watch`.
- `N`: number of rows displayed, selected by `score`.
- `debug`: forwarded to `display`; `true` prints every column except `tags`.

The site used for title links is read from the global variable `source`.
"""
function recommend(medium; M = 1000, N = 100, debug = false)
    get_ranking_df(medium, source) |>
    released(medium) |>
    inv(seen) |>
    inv(recap) |>
    inv(cross_recap) |>
    inv(dependent) |>
    # honor the keyword arguments instead of hard-coded pool sizes
    top(M, :watch) |>
    top(N, :score) |>
    display(debug)
end;

In [None]:
# function recommend(
#     medium;
#     task = "temporal_causal",
#     source = source,
#     relevance = "MLE.Ensemble.3",
#     M = 500,
#     N = 100,
#     similarity_spec = (weights = nothing, penalty = 1),
#     constraint_spec = (
#         intrarelated = 0,
#         interrelated = 0.25,
#         crossrelated = 0.1,
#         ptw = 0.1,
#         seasonal = 0.25,
#     ),
#     extra_filters = identity,
#     n_explanations = 2,
# )
#     # filter out invalid items
#     allowed_df =
#         get_rec_df(task, medium, source, relevance) |>
#         inv(seen) |>
#         inv(recap) |>
#         inv(dependent) |>
#         inv(crossrecap) |>
#         extra_filters

#     # rank items
#     rec_df =
#         allowed_df |>
#         released(medium) |>
#         top(M) |>
#         rerank(medium, N, similarity_spec, constraint_spec)

#     # attach metadata
#     attach_also_watched!(rec_df, allowed_df, medium)
#     attach_explanations!(rec_df, medium, task, relevance, n_explanations)

#     rec_df |> display(medium)
# end;