In [None]:
# Rate-limits API calls and makes single-page requests

In [None]:
import JupyterFormatter
JupyterFormatter.enable_autoformat();

# Resources

In [None]:
import CodecZstd
import CSV
import DataFrames
import Dates
import Glob
import HTTP
import JSON3
import Memoize: @memoize
import MsgPack
import Oxygen


# Resources

# Shard assignment for this machine as a `(machine index, num machines)` tuple.
# Set this to the real values when running in a cluster; with `(0, 1)` the sharding
# filter at the end of `load_resources` keeps every resource on this machine.
get_partition() = (0, 1);

# A resource is a plain string-keyed dictionary. Every resource built by
# `load_resources` carries at least the "location" and "ratelimit" keys.
const Resource = Dict{String,Any}

# Build the list of resources (tokens and proxies) that this machine may hand out.
#
# Reads proxy lists and credentials from files under `env_path`, builds one resource
# dictionary per rate-limited unit for each location ("mal", "malweb", "anilist",
# "kitsu", "animeplanet") and returns the subset that `get_partition()` assigns to
# this machine.
function load_resources()::Vector{Resource}
    env_path = "../../../environment" # TODO set

    # proxies
    # when a geolocation file exists, only proxies whose ip is listed as "us" are kept
    path = "$env_path/proxies/geolocations.txt"
    if ispath(path)
        geo_df = CSV.read(path, DataFrames.DataFrame)
        valid_ips = Set(filter(x -> x["geo location"] == "us", geo_df).ip)
    else
        valid_ips = nothing
    end
    proxies = String[]
    path = "$env_path/proxies/proxies.txt"
    if ispath(path)
        proxy_df = CSV.read(
            path,
            DataFrames.DataFrame,
            header = ["host", "port", "username", "password"],
            delim = ':',
        )
        for (host, port, username, password) in
            zip(proxy_df.host, proxy_df.port, proxy_df.username, proxy_df.password)
            # the proxy ip is the last dash-separated component of the username
            ip = split(username, "-")[end]
            if !isnothing(valid_ips) && ip ∉ valid_ips
                continue
            end
            push!(proxies, "http://$username:$password@$host:$port")
        end
    end
    sort!(proxies)

    # mal (ip and token limit)
    mal_tokens =
        [only(readlines(x)) for x in Glob.glob("$env_path/mal/authentication/*.txt")]
    mal_resources = [
        Dict("location" => "mal", "token" => x, "proxyurls" => [], "ratelimit" => 8) for
        x in mal_tokens
    ]
    # deal the proxies out to the tokens round-robin; nothing to do without tokens
    if !isempty(mal_resources)
        i = 1
        for proxy in proxies
            if length(mal_resources[i]["proxyurls"]) < 10
                # bound token size
                push!(mal_resources[i]["proxyurls"], proxy)
            end
            # advance to the next token, wrapping from the last one back to the first
            i = mod1(i + 1, length(mal_resources))
        end
    end

    # malweb (ip limit)
    malweb_resources =
        [Dict("location" => "malweb", "proxyurl" => x, "ratelimit" => 4) for x in proxies]

    # anilist (ip limit)
    anilist_resources =
        [Dict("location" => "anilist", "proxyurl" => x, "ratelimit" => 4) for x in proxies]

    # kitsu (ip limit)
    # each credential file holds exactly two lines: username, then password
    kitsu_tokens = []
    for x in Glob.glob("$env_path/kitsu/authentication/*.txt")
        (username, password) = readlines(x)
        push!(kitsu_tokens, Dict("username" => username, "password" => password))
    end
    kitsu_resources = [
        Dict(
            "location" => "kitsu",
            "proxyurl" => x,
            "tokens" => kitsu_tokens,
            "ratelimit" => 4,
        ) for x in proxies
    ]

    # animeplanet (credit limit)
    # one resource per allowed concurrent scrapfly session, all sharing one key
    animeplanet_token = only(readlines("$env_path/scrapfly/key.txt"))
    animeplanet_concurrency =
        parse(Int, only(readlines("$env_path/scrapfly/concurrency.txt")))
    animeplanet_resources = [
        Dict(
            "location" => "animeplanet",
            "token" => animeplanet_token,
            "uid" => uid,
            "ratelimit" => 8,
        ) for uid = 1:animeplanet_concurrency
    ]

    resources = vcat(
        mal_resources,
        malweb_resources,
        anilist_resources,
        kitsu_resources,
        animeplanet_resources,
    )
    # shard resources across multiple machines
    part, num_parts = get_partition()
    [x for (i, x) in Iterators.enumerate(resources) if (i % num_parts) == part]
end

# Bookkeeping kept for every resource in the pool.
struct ResourceMetadata
    # checkout generation: returned to the client by `take!`, compared by `put!`, and
    # incremented by `update_resources` when a timed-out checkout is reclaimed
    version::Int
    # unix timestamp at which the resource was handed out; `nothing` while it is not
    # checked out
    checkout_time::Union{Nothing,Float64}
    # unix timestamps of recent requests; appended to and trimmed by `ratelimit!`
    request_times::Vector{Float64}
end

# Pool of resources shared between the HTTP handlers and the background refresher.
mutable struct Resources
    resources::Dict{Resource,ResourceMetadata} # resource -> (version, checkout time, request times)
    # per-location queue of resources that are currently available: `take!` pops from
    # the front and `put!` pushes to the back
    index::Dict{String,Vector{Resource}} # location -> [resources]
    # taken by take!, put!, update_resources and request before touching the fields above
    lock::ReentrantLock
end

# Build a pool from a collection of resource dictionaries. Every resource starts at
# version 0, not checked out and with no recorded request times, and is queued under
# its "location".
function Resources(resources)
    by_location = Dict{String,Vector{Resource}}()
    for res in resources
        queue = get!(() -> Resource[], by_location, res["location"])
        push!(queue, res)
    end
    metadata = Dict(x => ResourceMetadata(0, nothing, []) for x in resources)
    Resources(metadata, by_location, ReentrantLock())
end

# Process-wide resource pool, filled once at startup from the files read by
# `load_resources`.
const RESOURCES = Resources(load_resources());

# Periodically reconcile the pool `r` with the resource files; never returns.
#
# Every `refresh_secs` seconds the resources are reloaded via `load_resources()` and:
#   * resources that are no longer listed are dropped from the pool,
#   * resources checked out for more than `timeout_secs` seconds are reclaimed: their
#     version is incremented (so a late `put!` carrying the old version is ignored)
#     and they are queued again,
#   * newly listed resources are added.
function update_resources(r::Resources, refresh_secs::Real, timeout_secs::Real)
    # queue of available resources for a location, created on first use so that a
    # location that was not present at startup does not raise a KeyError
    available(location) = get!(() -> Resource[], r.index, location)
    while true
        sleep(refresh_secs)
        # file I/O happens outside the lock so take!/put! are not blocked by it
        new_resources = Set(load_resources())
        lock(r.lock) do
            # snapshot under the lock: take!/put! write to r.resources concurrently
            old_resources = Set(keys(r.resources))
            # update resources
            t = Dates.datetime2unix(Dates.now())
            for k in old_resources
                if k ∉ new_resources
                    # resource got assigned to a different machine
                    delete!(r.resources, k)
                    filter!(x -> x != k, available(k["location"]))
                else
                    m = r.resources[k]
                    if !isnothing(m.checkout_time) && t - m.checkout_time > timeout_secs
                        # resource was never returned, reclaim it
                        r.resources[k] =
                            ResourceMetadata(m.version + 1, nothing, m.request_times)
                        pushfirst!(available(k["location"]), k)
                    end
                end
            end
            for k in setdiff(new_resources, old_resources)
                # resource was added
                r.resources[k] = ResourceMetadata(0, nothing, [])
                pushfirst!(available(k["location"]), k)
            end
        end
    end
end

# Check out one available resource for `location`.
#
# Polls the pool until a resource is available or more than `timeout` seconds have
# passed. Returns `Dict("resource" => ..., "version" => ...)` on success and `nothing`
# on timeout. A location with no queue is treated like an empty one, since the
# background refresh may add resources for it later.
function take!(r::Resources, location::String, timeout::Real)
    poll_secs = 0.01 # pause between attempts so waiting callers do not spin on the lock
    start = Dates.datetime2unix(Dates.now())
    while true
        val = lock(r.lock) do
            queue = get(r.index, location, nothing)
            if isnothing(queue) || isempty(queue)
                return nothing
            end
            k = popfirst!(queue)
            m = r.resources[k]
            # invariant: anything sitting in the queue is not checked out
            @assert isnothing(m.checkout_time)
            r.resources[k] = ResourceMetadata(
                m.version,
                Dates.datetime2unix(Dates.now()),
                m.request_times,
            )
            Dict("resource" => k, "version" => m.version)
        end
        if !isnothing(val)
            return val
        end
        if Dates.datetime2unix(Dates.now()) - start > timeout
            return nothing
        end
        sleep(poll_secs)
    end
end

# Return a checked-out resource to the pool.
#
# The call is ignored when the resource is no longer part of the pool, when `version`
# does not match the current checkout generation (the checkout was reclaimed in the
# meantime), or when the resource is not checked out at all.
function put!(r::Resources, resource::Dict, version::Integer)
    lock(r.lock) do
        if resource ∉ keys(r.resources)
            return
        end
        m = r.resources[resource]
        if m.version != version
            return
        end
        if isnothing(m.checkout_time)
            # already returned: queueing it a second time would hand the same resource
            # to two callers and trip the invariant checked in take!
            return
        end
        r.resources[resource] = ResourceMetadata(m.version, nothing, m.request_times)
        push!(r.index[resource["location"]], resource)
    end
end

# Background task: reload the resource files every 60 seconds and reclaim any resource
# that has been checked out for more than 600 seconds.
Threads.@spawn update_resources(RESOURCES, 60.0, 600.0);

# Serialization helpers shared by the HTTP handlers and clients.

# Serialize `d` to JSON bytes.
jsonpack(d::Dict)::Vector{UInt8} = Vector{UInt8}(JSON3.write(d))
# Parse JSON bytes into a Dict{String,Any}. The bytes are copied first because
# `String(::Vector{UInt8})` takes ownership of the buffer and truncates it, which
# would empty the caller's message body.
jsonunpack(d::Vector{UInt8}) = JSON3.read(String(copy(d)), Dict{String,Any})
# Serialize `d` to zstd-compressed MessagePack bytes.
msgpack(d::Dict) = CodecZstd.transcode(CodecZstd.ZstdCompressor, MsgPack.pack(d))
# Inverse of `msgpack`: decompress, then unpack.
msgunpack(d::Vector{UInt8}) =
    MsgPack.unpack(CodecZstd.transcode(CodecZstd.ZstdDecompressor, d))
# Serialize `d` for use as an HTTP message.
#
# Returns a `(headers, body)` tuple whose "Content-Type" header matches `encoding`
# (`:json` or `:msgpack`), ready to be splatted into `HTTP.request` / `HTTP.Response`.
# Throws an `ArgumentError` for any other encoding.
function encode(d::Dict, encoding::Symbol)
    if encoding == :json
        return Dict("Content-Type" => "application/json"), jsonpack(d)
    elseif encoding == :msgpack
        return Dict("Content-Type" => "application/msgpack"), msgpack(d)
    end
    # an explicit error rather than @assert, which is not meant for argument validation
    throw(ArgumentError("unsupported encoding $(repr(encoding))"))
end
# Parse the body of an HTTP message according to its "Content-Type" header.
#
# Supports "application/json" and "application/msgpack" (the formats produced by
# `encode`). Throws an `ArgumentError` for any other or missing content type.
function decode(r::HTTP.Message)::Dict
    if HTTP.headercontains(r, "Content-Type", "application/json")
        return jsonunpack(r.body)
    elseif HTTP.headercontains(r, "Content-Type", "application/msgpack")
        return msgunpack(r.body)
    end
    # the message comes from the network, so reject it explicitly instead of asserting
    throw(ArgumentError("unsupported or missing Content-Type"))
end

# POST /resources: check resources out of and back into the pool.
#
# Request body fields:
#   "method" => "take": also needs "location" and "timeout"; responds 200 with the
#                       JSON-encoded token, or 404 if nothing became available in time
#   "method" => "put":  also needs "token" (as returned by "take"); responds 200
# Any other method is answered with 400.
Oxygen.@post "/resources" function resources_api(r::HTTP.Request)::HTTP.Response
    data = decode(r)
    method = data["method"]
    if method == "take"
        token = take!(RESOURCES, data["location"], data["timeout"])
        if isnothing(token)
            return HTTP.Response(404, [])
        end
        return HTTP.Response(200, encode(token, :json)...)
    elseif method == "put"
        put!(RESOURCES, data["token"]["resource"], data["token"]["version"])
        return HTTP.Response(200, [])
    end
    # unknown method: a client error, not an internal assertion failure
    HTTP.Response(400, [])
end

# Port the Oxygen server listens on.
const port = 4001 # todo parse(Int, ARGS[1])
# Start serving the routes registered with Oxygen on 0.0.0.0:port from a spawned task.
Threads.@spawn Oxygen.serveparallel(; host = "0.0.0.0", port = port, access_log = nothing)

# Ratelimits

In [None]:
# Serializes writes to stdout so log lines from concurrent tasks do not interleave.
const STDOUT_LOCK = ReentrantLock()

# Print `x` to stdout as a timestamped "[ERROR]" line and flush immediately.
function logerror(x::String)
    lock(STDOUT_LOCK)
    try
        println("$(Dates.now()) [ERROR] $x")
        flush(stdout)
    finally
        unlock(STDOUT_LOCK)
    end
end;

In [None]:
# Number of most recent request timestamps kept per resource.
const RATELIMIT_WINDOW = 1 # todo parse(Int, ARGS[2])

# Block until the resource described by `x` may issue another request, then record
# the request.
#
# Looking at the last RATELIMIT_WINDOW recorded request times, the next request is
# allowed `length(recent) * ratelimit` seconds after the oldest of them. The current
# time is then appended to `x.request_times`, which is trimmed back to the window.
function ratelimit!(x::ResourceMetadata, ratelimit::Real)
    unixnow() = Dates.datetime2unix(Dates.now())
    history = x.request_times
    if !isempty(history)
        recent = history[max(1, length(history) - RATELIMIT_WINDOW + 1):end]
        remaining = first(recent) + length(recent) * ratelimit - unixnow()
        remaining > 0 && sleep(remaining)
    end
    push!(history, unixnow())
    if length(history) > RATELIMIT_WINDOW
        popfirst!(history)
    end
end;

In [None]:
# Endpoint that `callproxy` POSTs every outgoing request description to.
const LAYER_1_URL = "http://localhost:4000/proxy"; # todo read from ARGS[3] (a URL string, so no parse(Int, ...))

In [None]:
# Result of a proxied request as returned by `callproxy` and `request`.
struct Response
    # HTTP status code reported by the proxy layer
    status::Int
    # response body as text; empty when status >= 400
    body::String
    # response headers with lowercased names; empty when status >= 400
    headers::Dict{String,String}
end

# Forward one HTTP request through the proxy layer at LAYER_1_URL.
#
# The request description (`method`, `url`, `headers`, `body`, `proxyurl`, `sessionid`)
# is POSTed as JSON. Returns a `Response` with the proxy layer's status, its raw body
# text and its headers with lowercased names. For a status >= 400 the body and
# headers are left empty.
function callproxy(
    method::String,
    url::String,
    headers::Dict{String,String},
    body::Vector{UInt8},
    proxyurl::Union{String,Nothing},
    sessionid::String,
)
    payload = Dict(
        "method" => method,
        "url" => url,
        "headers" => headers,
        "body" => body,
        "proxyurl" => proxyurl,
        "sessionid" => sessionid,
    )
    r = HTTP.request(
        "POST",
        LAYER_1_URL,
        encode(payload, :json)...,
        status_exception = false,
    )
    if r.status >= 400
        return Response(r.status, "", Dict{String,String}())
    end
    # Hand the body over as text without decoding it first: decoding goes through
    # `String(r.body)`, which empties the byte vector, so converting the body again
    # afterwards would always produce "".
    Response(
        r.status,
        String(r.body),
        Dict{String,String}(lowercase(k) => v for (k, v) in r.headers),
    )
end

# Issue one rate-limited request on behalf of a checked-out `resource`.
#
# Waits according to the resource's "ratelimit", then forwards the request through
# `callproxy` using the proxy (or, for "animeplanet", the scrapfly session) that
# belongs to the resource. Returns a `Response`; the status is 500 when the resource
# is not part of the pool. Throws an `ArgumentError` for an unknown location.
function request(
    resource::Resource,
    method::String,
    url::String,
    headers::Dict{String,String},
    body::Vector{UInt8},
)::Response
    metadata = lock(RESOURCES.lock) do
        get(RESOURCES.resources, resource, nothing)
    end
    if isnothing(metadata)
        return Response(500, "", Dict{String,String}())
    end
    ratelimit!(metadata, resource["ratelimit"])
    location = resource["location"]
    if location == "mal"
        # a mal token owns several proxies; pick one at random per request
        proxyurl = rand(resource["proxyurls"])
        return callproxy(
            method,
            url,
            headers,
            body,
            proxyurl,
            string(hash((resource, proxyurl))),
        )
    elseif location in ["malweb", "anilist", "kitsu"]
        proxyurl = resource["proxyurl"]
        return callproxy(
            method,
            url,
            headers,
            body,
            proxyurl,
            string(hash((resource, proxyurl))),
        )
    elseif location == "animeplanet"
        # callproxy requires a String session id, so stringify the hash as the other
        # branches do
        sessionid = string(hash(resource))
        url = string(
            HTTP.URI(
                "https://api.scrapfly.io/scrape";
                query = Dict(
                    "session" => sessionid,
                    "key" => resource["token"],
                    "proxy_pool" => "public_datacenter_pool",
                    "url" => url,
                    "country" => "us",
                ),
            ),
        )
        return callproxy(method, url, headers, body, nothing, sessionid)
    end
    throw(ArgumentError("unknown resource location $(repr(location))"))
end;

# Single-page Requests

In [None]:
# Stamped into the "version" field of every list entry built by `mal_get_list`.
const API_VERSION = "4.2.0";

In [None]:
# Map from HTML entity name to its replacement characters, read from "entities.json"
# in the working directory. Memoized, so the file is parsed only on the first call.
@memoize function html_entity_map()
    raw = read("entities.json", String)
    parsed = JSON3.read(raw)
    Dict(String(name) => spec["characters"] for (name, spec) in parsed)
end

# Decode URI percent-escapes and HTML entities in `text`.
#
# Named entities are taken from `html_entity_map()`. Decimal (`&#65;`) and hexadecimal
# (`&#x41;`) character references are decoded afterwards; if one of them cannot be
# parsed, the error is logged and the text decoded so far is returned.
function html_unescape(text::AbstractString)
    text = HTTP.unescapeuri(text)
    entities = Dict(k => v for (k, v) in html_entity_map() if occursin(k, text))
    if !isempty(entities)
        # Replace all named entities in one pass. Longer names come first in the
        # alternation so that e.g. "&amp;" wins over "&amp". Because replacements are
        # not rescanned, text produced by one entity is never unescaped a second time.
        names = sort!(collect(keys(entities)), by = length, rev = true)
        pattern = Regex(join(("\\Q$name\\E" for name in names), "|"))
        text = replace(text, pattern => name -> entities[name]) # html entities
    end
    try
        text = replace(
            text,
            [
                x.match => Char(parse(Int, only(x.captures))) for
                x in eachmatch(r"&#(\d+);", text)
            ]...,
        ) # numeric entities
        text = replace(
            text,
            [
                x.match => Char(parse(Int, only(x.captures), base = 16)) for
                x in eachmatch(r"&#x([0-9a-fA-F]+);", text)
            ]...,
        ) # hex entities
    catch
        logerror("html_unescape could not parse $text")
    end
    text
end;

In [None]:
# POST /mal: run a MyAnimeList API endpoint with a checked-out "mal" resource.
#
# Request body fields: "endpoint", "token" (as returned by /resources) and the
# endpoint's own arguments. The only endpoint is "medialist", which takes "username",
# "medium" and "offset". A wrong resource location, an unknown endpoint or any
# exception results in a logged error and a 500 response.
Oxygen.@post "/mal" function mal_api(r::HTTP.Request)::HTTP.Response
    data = decode(r)
    endpoint = data["endpoint"]
    resource = data["token"]["resource"]
    if resource["location"] != "mal"
        logerror("""mal_api invalid resource $(resource["location"])""")
        return HTTP.Response(500, [])
    end
    try
        if endpoint != "medialist"
            logerror("mal_api invalid endpoint $endpoint")
            return HTTP.Response(500, [])
        end
        return mal_get_list(resource, data["username"], data["medium"], data["offset"])
    catch e
        # leave the token (which carries credentials) out of the log line
        args = Dict(k => v for (k, v) in data if k != "token")
        logerror("mal_api error $e for $args")
        return HTTP.Response(500, [])
    end
end

# Fetch one page of a user's MyAnimeList anime or manga list.
#
# `medium` is "anime" or "manga" (anything else throws an `ArgumentError`); `offset`
# is the paging offset. Responds 200 with JSON
# `{"data": [entries], "offset": offset, "next": url}`, where "next" is present only
# if the API reported a following page. Upstream error statuses are passed through;
# a reply without "data" yields 500.
function mal_get_list(resource::Resource, username::String, medium::String, offset::Int)
    # the list_status field names differ between the two media
    if medium == "anime"
        progress_col = "num_episodes_watched"
        repeat_col = "is_rewatching"
        repeat_count_col = "num_times_rewatched"
        repeat_value_col = "rewatch_value"
    elseif medium == "manga"
        progress_col = "num_chapters_read"
        repeat_col = "is_rereading"
        repeat_count_col = "num_times_reread"
        repeat_value_col = "reread_value"
    else
        throw(ArgumentError("unknown medium $(repr(medium))"))
    end
    query = "https://api.myanimelist.net/v2/users/$username/$(medium)list"
    params = "limit=1000&fields=list_status&nsfw=true"
    if offset != 0
        params = "offset=$offset&$params"
    end
    entries = []
    headers = Dict("X-MAL-CLIENT-ID" => resource["token"])
    url = "$query?$params"
    r = request(resource, "GET", url, headers, UInt8[])
    if r.status >= 400
        logerror("mal_get_list received status $(r.status) for $url")
        return HTTP.Response(r.status, [])
    end
    # the proxy reply is JSON whose "content" field holds the API's own JSON text
    json = JSON3.read(JSON3.read(r.body)["content"])
    if "data" ∉ keys(json)
        logerror("mal_get_list received empty json $(keys(json)) for $url")
        return HTTP.Response(500, [])
    end
    optget(x::AbstractDict, k::String) = get(x, k, nothing)
    for x in json["data"]
        ls = x["list_status"]
        d = Dict(
            "version" => API_VERSION,
            "username" => username,
            "uid" => x["node"]["id"],
            "status" => optget(ls, "status"),
            "score" => optget(ls, "score"),
            "progress" => optget(ls, progress_col),
            "progress_volumes" => optget(ls, "num_volumes_read"),
            "started_at" => optget(ls, "start_date"),
            "completed_at" => optget(ls, "finish_date"),
            "priority" => optget(ls, "priority"),
            # look up the medium-specific field, not the literal key "repeat_col"
            "repeat" => optget(ls, repeat_col),
            "repeat_count" => optget(ls, repeat_count_col),
            "repeat_value" => optget(ls, repeat_value_col),
            "tags" => optget(ls, "tags"),
            "notes" => optget(ls, "comments"),
            "updated_at" => optget(ls, "updated_at"),
        )
        if !isnothing(d["updated_at"])
            # convert the timestamp to unix seconds; on failure keep the raw value
            try
                d["updated_at"] = Dates.datetime2unix(
                    Dates.DateTime(
                        d["updated_at"],
                        Dates.dateformat"yyyy-mm-ddTHH:MM:SS+00:00",
                    ),
                )
            catch
                logerror("mal_get_list cannot parse date " * string(d["updated_at"]))
            end
        end
        push!(entries, d)
    end
    ret = Dict("data" => entries, "offset" => offset)
    if "next" in keys(json["paging"])
        ret["next"] = json["paging"]["next"]
    end
    HTTP.Response(200, encode(ret, :json)...)
end;

In [None]:
# POST /malweb: run a MyAnimeList website endpoint with a checked-out "malweb"
# resource.
#
# Request body fields: "endpoint", "token" (as returned by /resources) and the
# endpoint's own arguments: "media" takes "medium" and "itemid", "username" takes
# "userid". A wrong resource location, an unknown endpoint or any exception results
# in a logged error and a 500 response.
Oxygen.@post "/malweb" function malweb_api(r::HTTP.Request)::HTTP.Response
    data = decode(r)
    endpoint = data["endpoint"]
    resource = data["token"]["resource"]
    if resource["location"] != "malweb"
        logerror("""malweb_api invalid resource $(resource["location"])""")
        return HTTP.Response(500, [])
    end
    try
        if endpoint == "media"
            return malweb_get_media(resource, data["medium"], data["itemid"])
        end
        if endpoint == "username"
            return malweb_get_username(resource, data["userid"])
        end
        logerror("malweb_api invalid endpoint $endpoint")
        return HTTP.Response(500, [])
    catch e
        # leave the token (which carries credentials) out of the log line
        args = Dict(k => v for (k, v) in data if k != "token")
        logerror("malweb_api error $e for $args")
        return HTTP.Response(500, [])
    end
end

# Resolve a MyAnimeList user id to a username via the user's comments page.
#
# Responds 200 with JSON `{"username": ...}` taken from the first "/profile/<name>"
# link found in the page, 404 if the page contains no such link, and passes upstream
# error statuses through.
function malweb_get_username(resource::Resource, userid::String)
    url = "https://myanimelist.net/comments.php?id=$userid"
    r = request(resource, "GET", url, Dict{String,String}(), UInt8[])
    if r.status >= 400
        logerror("malweb_get_username received status $(r.status) for $url")
        return HTTP.Response(r.status, [])
    end
    # only the first profile link is used
    m = match(r"/profile/([^\"/%]+)\"", r.body)
    if isnothing(m)
        return HTTP.Response(404, [])
    end
    username = html_unescape(only(m.captures))
    ret = Dict("username" => username)
    HTTP.Response(200, encode(ret, :json)...)
end

In [None]:
# @time r = HTTP.request(
#     "POST",
#     "http://localhost:4001/resources",
#     encode(Dict("method" => "take", "location" => "malweb", "timeout" => 0), :json)...,
#     status_exception = false,
# )
# token = decode(r);

In [None]:
# @time r2 = HTTP.request(
#     "POST",
#     "http://localhost:4001/malweb",
#     encode(
#         Dict("token" => token, "endpoint" => "username", "userid" => "15982320"),
#         :json,
#     )...,
#     status_exception = false,
# );

In [None]:
# @time r = HTTP.request(
#     "POST",
#     "http://localhost:4001/resources",
#     encode(Dict("method" => "put", "token" => token), :json)...,
#     status_exception = false,
# )