In [1]:
import JupyterFormatter
JupyterFormatter.enable_autoformat();

In [2]:
import Dates
import HTTP
import JSON
import Memoize: @memoize

# Version tag written into the "version" field of the records built in this file.
const API_VERSION = "4.2.0"
# Guards stdout so that log lines written from concurrent tasks do not interleave.
const STDOUT_LOCK = ReentrantLock()

"""
    logerror(x::String)

Print `x` to stdout as a timestamped `[ERROR]` line and flush immediately.

The write happens while holding `STDOUT_LOCK`. (The previous version locked
`COOKIE_LOCK`, a name that is not defined anywhere, so every call raised
`UndefVarError` instead of logging.)
"""
function logerror(x::String)
    lock(STDOUT_LOCK) do
        println("$(Dates.now()) [ERROR] $x")
        flush(stdout)
    end
end

# Mapping from HTML entity name (the keys of "entities.json") to the characters
# it stands for. Wrapped in `@memoize`, so the file is expected to be parsed
# only on the first call.
@memoize function html_entity_map()
    parsed = JSON.parsefile("entities.json")
    Dict(name => info["characters"] for (name, info) in parsed)
end

"""
    unescape(text::AbstractString)

Decode `text` in three steps: percent-decoding (`HTTP.unescapeuri`), named HTML
entities (from `html_entity_map()`), then decimal (`&#NN;`) and hexadecimal
(`&#xHH;`) character references.

Named entities are substituted in a single simultaneous pass so that text
produced by one substitution is never re-scanned by another: `"&amp;lt;"`
becomes `"&lt;"`, not `"<"`. If a numeric reference cannot be converted to a
character the failure is logged and the text decoded so far is returned.
"""
function unescape(text::AbstractString)
    text = HTTP.unescapeuri(text)
    # Only the entity names that actually occur in `text` are worth substituting.
    entities = Dict(k => v for (k, v) in html_entity_map() if occursin(k, text))
    if !isempty(entities)
        # Longest names first: when several patterns match at the same position,
        # `replace` applies the one listed first, so "&amp;" beats "&amp".
        names = sort!(collect(keys(entities)); by = length, rev = true)
        text = replace(text, (k => entities[k] for k in names)...) # html entities
    end
    try
        text = replace(
            text,
            [
                x.match => Char(parse(Int, only(x.captures))) for
                x in eachmatch(r"&#(\d+);", text)
            ]...,
        ) # numeric entities
        text = replace(
            text,
            [
                x.match => Char(parse(Int, only(x.captures), base = 16)) for
                x in eachmatch(r"&#x([0-9a-fA-F]+);", text)
            ]...,
        ) # hex entities
    catch
        # e.g. a code point that overflows `Int` or is not a valid `Char`
        logerror("could not unescape $text")
    end
    text
end

"""
    Session

Per-call scraping state: a session identifier, the proxy string and proxy
type, the timestamps of recent requests and the rate-limit period.
"""
struct Session
    sessionid::String
    proxy::String
    proxytype::String
    request_times::Vector{Float64}
    ratelimit_period::Float64
end

# Keyword constructor: every new session starts with an empty request history.
Session(; sessionid, proxy, proxytype, ratelimit_period) =
    Session(sessionid, proxy, proxytype, Float64[], ratelimit_period)

"""
    Response

Plain-data copy of an HTTP reply: status code, body as a `String`, and headers.
"""
struct Response
    status::Int
    body::String
    headers::HTTP.Headers
end

# Convert an `HTTP.Response`, turning its body bytes into a `String`.
Response(r::HTTP.Response) = Response(r.status, String(r.body), r.headers)

"""
    ratelimit!(times::Vector{Float64}, period)

Block until the next request may be sent, then record its send time in `times`.

`times` holds the send times (seconds since the Unix epoch) of at most the last
8 requests. A request may go out once `length(times) - 1` periods have elapsed
since the oldest recorded entry. Returns `nothing`.

Two details matter for correctness:
- the recorded timestamp is the moment the request is actually released (after
  any sleep), not the moment the caller arrived; otherwise the first request
  after the window fills would be let through without any delay;
- `time()` is used rather than local wall-clock time, so a DST change cannot
  shift the window.
"""
function ratelimit!(times::Vector{Float64}, period)
    max_times = 8
    push!(times, time())
    if length(times) > max_times
        popfirst!(times)
    end
    wait_until = times[1] + (length(times) - 1) * period
    delta = wait_until - times[end]
    if delta > 0
        sleep(delta)
        # Store when the request really goes out, guarding against a timer that
        # wakes slightly early.
        times[end] = max(wait_until, time())
    end
    return nothing
end

"""
    request(s, method, url, headers, body, ::Nothing)

Single-attempt transport used by the retrying `request` method. It applies the
session's rate limit and is then supposed to perform the HTTP call.

The transport itself is not implemented yet, so this method always throws after
rate limiting. An explicit `error` is used instead of `@assert false` so that
the failure cannot be compiled out and turn into a silent `nothing` return.
"""
function request(
    s::Session,
    method::String,
    url::String,
    headers::Dict{String,String},
    body::Vector{UInt8},
    ::Nothing,
)
    ratelimit!(s.request_times, s.ratelimit_period)
    # Previous direct implementation, kept for reference while the proxy
    # transport is being written. NOTE(review): it reads `s.cookies`, a field
    # that `Session` does not have.
    # if s.proxytype == "brightdata"
    #     host, port, username, password = split(s.proxy, ":")
    #     proxyurl = "http://$username:$password@$host:$port"
    #     return HTTP.request(
    #         method,
    #         url,
    #         headers,
    #         body;
    #         proxy = proxyurl,
    #         cookiejar = s.cookies,
    #         status_exception = false,
    #     )
    # else
    #     s.proxytype == ""
    #     return HTTP.request(
    #         method,
    #         url,
    #         headers,
    #         body;
    #         cookiejar = s.cookies,
    #         status_exception = false,
    #     )
    # end
    # connect to proxy
    error("request: HTTP transport is not implemented (proxytype = \"$(s.proxytype)\")")
end

"""
    request(s, method, url, headers, body, should_retry::Function)

Send a request, retrying with exponential backoff while `should_retry(r)` is
true for the raw reply `r`.

Before each retry the function sleeps for the server's `Retry-After` value (a
whole number of seconds, capped at the maximum backoff delay) when the header
is present and valid, and for the current backoff delay otherwise. Returns the
first reply that should not be retried as a `Response`, or a synthetic
`Response(500, "", ...)` once all attempts are used up.
"""
function request(
    s::Session,
    method::String,
    url::String,
    headers::Dict{String,String},
    body::Vector{UInt8},
    should_retry::Function,
)
    # Bound as a local so the Retry-After cap below can use it; previously the
    # name only existed as a keyword argument, so the cap always threw.
    max_delay = 300
    for delay in ExponentialBackOff(;
        n = 9,
        first_delay = 1,
        max_delay = max_delay,
        factor = 2.0,
        jitter = 0.1,
    )
        r = request(s, method, url, headers, body, nothing)
        if !should_retry(r)
            return Response(r)
        end
        # Header names are case-insensitive, so normalise before the lookup.
        resp_headers = Dict(lowercase(k) => v for (k, v) in r.headers)
        retry_after = get(resp_headers, "retry-after", nothing)
        if isnothing(retry_after)
            logerror("retrying $url after $delay seconds")
            sleep(delay)
            continue
        end
        seconds = tryparse(Int, retry_after)
        if isnothing(seconds) || seconds < 0
            # e.g. the HTTP-date form of Retry-After, which is not handled here
            logerror("invalid retry-after $resp_headers for $url")
            sleep(delay)
        else
            timeout = min(seconds, max_delay)
            logerror("retry-after $url after $timeout seconds")
            sleep(timeout)
        end
    end
    Response(500, "", HTTP.Headers())
end

"""
    fetch(sessionid, proxy, proxytype, ratelimit_period, auth_token, source, resource, query)

Entry point: build a fresh `Session` and dispatch to the scraper for the given
`source` ("mal", "anilist" or "kitsu") and `resource`.

Returns whatever the selected scraper returns. Throws `ArgumentError` when the
`source`/`resource` pair is not supported (previously an `@assert false`, which
is not guaranteed to run).
"""
function fetch(
    sessionid::String,
    proxy::String,
    proxytype::String,
    ratelimit_period::Float64,
    auth_token::Union{String,Nothing},
    source::String,
    resource::String,
    query::String,
)
    s = Session(
        sessionid = sessionid,
        proxy = proxy,
        proxytype = proxytype,
        ratelimit_period = ratelimit_period,
    )
    is_list = resource in ("mangalist", "animelist")
    is_media = resource in ("manga", "anime")
    if source == "mal"
        if is_list
            return mal_get_list(s, auth_token, resource, query)
        elseif is_media
            return mal_get_media(s, auth_token, resource, query)
        elseif resource == "username"
            return mal_get_username(s, auth_token, query)
        end
    elseif source == "anilist"
        if is_list
            return anilist_get_list(s, resource, query)
        elseif is_media
            return anilist_get_media(s, resource, query)
        elseif resource == "userid"
            return anilist_get_userid(s, query)
        end
    elseif source == "kitsu"
        # NOTE(review): kitsu_get_list / kitsu_get_media are not defined in the
        # part of the notebook visible here - confirm they exist.
        if is_list
            return kitsu_get_list(s, auth_token, resource, query)
        elseif is_media
            return kitsu_get_media(s, auth_token, resource, query)
        elseif resource == "userid"
            return kitsu_get_userid(s, auth_token, query)
        elseif resource == "token"
            return kitsu_get_token(s, query)
        end
    end
    throw(ArgumentError("unsupported source/resource: \"$source\"/\"$resource\""))
end;

# MyAnimeList

In [3]:
"""
    should_retry_mal(r)

Retry predicate for MyAnimeList API replies. Returns `true` (after logging the
reason) for error statuses other than 403/404, for bodies that are not valid
JSON, and for JSON values with no keys; `false` otherwise.
"""
function should_retry_mal(r)
    # 403 error if list is private
    # 404 error if invalid url
    terminal = r.status in [403, 404]
    if HTTP.iserror(r) && !terminal
        logerror("status code $(r.status) for $(r.request.url)")
        return true
    end
    parsed = nothing
    try
        # copy so that the reply's own body bytes stay intact for the caller
        parsed = JSON.parse(String(copy(r.body)))
    catch
        logerror("cannot parse json for $(r.request.url) $(String(copy(r.body)))")
        return true
    end
    if isempty(keys(parsed))
        logerror("empty response for $(r.request.url)")
        return true
    end
    return false
end

"""
    mal_get_list(s, auth_token, resource, username)

Download the whole anime or manga list (`resource` is "animelist" or
"mangalist") of `username` from the MyAnimeList v2 API, following the
`paging.next` links.

Returns `Dict("data" => entries)` with one record per list item, or `nothing`
when a page comes back with status >= 400 or more than 1000 pages are needed.
Fields missing from an item's `list_status` are stored as `nothing`.
"""
function mal_get_list(s::Session, auth_token::String, resource::String, username::String)
    if resource == "animelist"
        progress_col = "num_episodes_watched"
        repeat_col = "is_rewatching"
        repeat_count_col = "num_times_rewatched"
        repeat_value_col = "rewatch_value"
        medium = "anime"
    elseif resource == "mangalist"
        progress_col = "num_chapters_read"
        repeat_col = "is_rereading"
        repeat_count_col = "num_times_reread"
        repeat_value_col = "reread_value"
        medium = "manga"
    else
        @assert false
    end
    headers = Dict("X-MAL-CLIENT-ID" => auth_token)
    url = (
        "https://api.myanimelist.net/v2/users/" *
        "$username/$(medium)list?limit=1000&fields=list_status&nsfw=true"
    )
    safeget(x::Dict, k::String) = get(x, k, nothing)
    entries = Dict{String,Any}[]
    iter = 0
    max_iters = 1000
    while true
        r = request(s, "GET", url, headers, UInt8[], should_retry_mal)
        if r.status >= 400
            logerror("received status $(r.status) for $url")
            return nothing
        end
        json = JSON.parse(r.body)
        for x in json["data"]
            ls = x["list_status"]
            d = Dict{String,Any}(
                "version" => API_VERSION,
                "username" => username,
                "uid" => x["node"]["id"],
                "status" => safeget(ls, "status"),
                "score" => safeget(ls, "score"),
                "progress" => safeget(ls, progress_col),
                "progress_volumes" => safeget(ls, "num_volumes_read"),
                "started_at" => safeget(ls, "start_date"),
                "completed_at" => safeget(ls, "finish_date"),
                "priority" => safeget(ls, "priority"),
                # look up the medium-specific column, not the literal "repeat_col"
                "repeat" => safeget(ls, repeat_col),
                "repeat_count" => safeget(ls, repeat_count_col),
                "repeat_value" => safeget(ls, repeat_value_col),
                "tags" => safeget(ls, "tags"),
                "notes" => safeget(ls, "comments"),
                "updated_at" => nothing,
            )
            updated_at = safeget(ls, "updated_at")
            if !isnothing(updated_at)
                try
                    d["updated_at"] = Dates.datetime2unix(
                        Dates.DateTime(
                            updated_at,
                            Dates.dateformat"yyyy-mm-ddTHH:MM:SS+00:00",
                        ),
                    )
                catch
                    # best effort: a timestamp in any other format stays `nothing`
                end
            end
            push!(entries, d)
        end
        if haskey(json["paging"], "next")
            url = json["paging"]["next"]
        else
            break
        end
        iter += 1
        if iter == max_iters
            logerror("too many pages $url")
            return nothing
        end
    end
    Dict("data" => entries)
end

"""
    should_retry_malweb(r)

Retry predicate for MyAnimeList web pages. Returns `true` (after logging) for
error statuses other than 403/404 and for empty bodies; `false` otherwise.
"""
function should_retry_malweb(r)
    # 403 error if list is private
    # 404 error if invalid url
    if HTTP.iserror(r) && !(r.status in [403, 404])
        logerror("status code $(r.status) for $(r.request.url)")
        return true
    elseif isempty(r.body)
        logerror("received empty response for $(r.request.url)")
        return true
    end
    return false
end

"""
    mal_get_username(s, auth_token, userid)

Resolve a numeric MyAnimeList user id to a username by fetching the comments
page for that id and taking the first `/profile/<name>"` link found in it.
Returns the unescaped name, or `nothing` on an error status or when no profile
link is present. `auth_token` is accepted but not used.
"""
function mal_get_username(s::Session, auth_token::String, userid::String)
    url = "https://myanimelist.net/comments.php?id=$userid"
    page = request(s, "GET", url, Dict{String,String}(), UInt8[], should_retry_malweb)
    if page.status >= 400
        logerror("status code $(page.status) for $url")
        return nothing
    end
    found = match(r"/profile/([^\"/%]+)\"", String(page.body))
    # it's expected that some userids are not bound to a user
    isnothing(found) && return nothing
    return unescape(only(found.captures))
end

"""
    mal_get_media(s, auth_token, medium, itemid)

Fetch the MyAnimeList web page of one item and scrape it. Returns
`Dict("details" => ..., "relations" => ...)`, or `nothing` when the page comes
back with status >= 400. `auth_token` is accepted but not used.
"""
function mal_get_media(s::Session, auth_token::String, medium::String, itemid::String)
    url = "https://myanimelist.net/$medium/$itemid"
    page = request(s, "GET", url, Dict{String,String}(), UInt8[], should_retry_malweb)
    if page.status >= 400
        logerror("status code $(page.status) for $url")
        return nothing
    end
    html = page.body
    return Dict(
        "details" => mal_get_media_details(html, medium, itemid),
        "relations" => mal_get_media_relations(html, medium, itemid),
    )
end

# Scrape the detail fields of one MyAnimeList item out of the raw page HTML.
#
# `text` is the page body, `medium` is "anime" or "manga", and `itemid` is copied
# into the "uid" field of the result. Returns a Dict keyed by field name. Fields
# that only exist for the other medium are `nothing`. A non-optional field whose
# pattern does not yield exactly one distinct match raises (from `only`).
function mal_get_media_details(text::String, medium::String, itemid::String)
    # Pull a field out of `text`. `anime_regex` / `manga_regex` are
    # (start, stop) regex fragments placed around `capture`; pass `nothing`
    # when the field does not exist for that medium.
    #   optional = true -> return `nothing` when there is no match
    #   multiple = true -> return every distinct match instead of exactly one
    # Matches are de-duplicated through a Set, then unescaped and stripped.
    function extract(
        anime_regex,
        manga_regex;
        capture = """(?s)(.*?)""",
        optional = false,
        multiple = false,
    )
        if (medium == "anime" && isnothing(anime_regex)) ||
           (medium == "manga" && isnothing(manga_regex))
            return nothing
        end
        get_regex(start, stop, capture) = Regex(start * capture * stop)
        if medium == "anime"
            regex = get_regex(anime_regex..., capture)
        elseif medium == "manga"
            regex = get_regex(manga_regex..., capture)
        else
            @assert false
        end
        matches = Set(only(m.captures) for m in eachmatch(regex, text))
        if optional && isempty(matches)
            return nothing
        end
        if multiple
            return [strip(unescape(x)) for x in matches]
        end
        strip(unescape(only(matches)))
    end

    # Clean up the synopsis: `nothing` for MAL's "no synopsis" placeholder,
    # otherwise drop the description span tag, turn "<br />\r" into newlines
    # and collapse runs of newlines.
    function summary(x)
        if startswith(x, "No synopsis information has been added to this title")
            return nothing
        end
        x = replace(x, """<span itemprop="description">""" => "", """<br />\r""" => "\n")
        x = replace(x, r"\n+" => "\n")
        strip(x)
    end

    # The type is either plain text or wrapped in a tag; in the latter case
    # return the text between '>' and '<'.
    function mediatype(x)
        # unpack href
        if '>' ∉ x
            return x
        end
        regex = Regex(">" * """(?s)(.*?)""" * "<")
        only([only(m.captures) for m in eachmatch(regex, x)])
    end

    # Split a "<start> to <end>" range after collapsing whitespace.
    # start = true  -> the first piece
    # start = false -> the second piece, or `nothing` when there is no " to "
    #                  (more than two pieces is logged and yields `nothing`)
    function date(x, start)
        x = replace(x, r"\s+" => " ")
        x = split(x, " to ")
        if start
            return first(x)
        end
        if length(x) > 2
            logerror("invalid date $x")
            return nothing
        elseif length(x) == 2
            return x[end]
        else
            return nothing
        end
    end

    # Same unwrapping as `mediatype`, but passes `nothing` through because the
    # season field is optional.
    function season(x)
        if isnothing(x)
            return x
        end
        # unpack href
        if '>' ∉ x
            return x
        end
        regex = Regex(">" * """(?s)(.*?)""" * "<")
        only([only(m.captures) for m in eachmatch(regex, x)])
    end

    # Collect the value of every title="..." attribute in the fragment.
    function studios(x)
        regex = Regex("title=\"" * """(?s)(.*?)""" * "\"")
        [only(m.captures) for m in eachmatch(regex, x)]
    end

    # Return the text of the single `>...</a>` link in the fragment, or
    # `nothing` when the fragment is missing or contains no link.
    function source(x)
        if isnothing(x)
            return x
        end
        regex = Regex(">" * """(?s)(.*?)""" * "</a>")
        matches = [only(m.captures) for m in eachmatch(regex, x)]
        if isempty(matches)
            return nothing
        end
        strip(only(matches))
    end

    # Each entry pairs the anime-page pattern with the manga-page pattern.
    Dict(
        "version" => API_VERSION,
        "uid" => itemid,
        "title" => extract(
            (
                """<div class="h1-title"><div itemprop="name"><h1 class="title-name h1_bold_none"><strong>""",
                "</strong></h1>",
            ),
            (
                """<span class="h1-title"><span itemprop="name">""",
                """(?:</span>|<br><span class="title-english">)""",
            ),
        ),
        "alttitle" => extract(
            ("""<p class="title-english title-inherit">""", """</p>"""),
            ("""<span class="title-english">""", """</span>"""),
            optional = true,
        ),
        "summary" =>
            extract(
                ("""Synopsis</h2></div><p itemprop="description">""", """(?:</p>|</a>)"""),
                ("""Synopsis</h2><span itemprop="description">""", """(?:</span>|</a>)"""),
            ) |> summary,
        "type" =>
            extract(
                ("""<span class="dark_text">Type:</span>""", """</div>"""),
                ("""<span class="dark_text">Type:</span>""", """</div>"""),
            ) |> mediatype,
        "status" => extract(
            ("""<span class="dark_text">Status:</span>""", """</div>"""),
            ("""<span class="dark_text">Status:</span>""", """</div>"""),
        ),
        "episodes" => extract(
            ("""<span class="dark_text">Episodes:</span>""", """</div>"""),
            nothing,
        ),
        "duration" => extract(
            ("""<span class="dark_text">Duration:</span>""", """</div>"""),
            nothing,
        ),
        "num_chapters" =>
            extract(nothing, ("""<span id="totalChaps".*?>""", """</span>""")),
        "num_volumes" => extract(nothing, ("""<span id="totalVols".*?>""", """</span>""")),
        # start_date and end_date are both cut from the same "Aired"/"Published" range
        "start_date" =>
            extract(
                ("""<span class="dark_text">Aired:</span>""", """</div>"""),
                ("""<span class="dark_text">Published:</span>""", """</div>"""),
            ) |> x -> date(x, true),
        "end_date" =>
            extract(
                ("""<span class="dark_text">Aired:</span>""", """</div>"""),
                ("""<span class="dark_text">Published:</span>""", """</div>"""),
            ) |> x -> date(x, false),
        "season" =>
            extract(
                ("""<span class="dark_text">Premiered:</span>""", """</div>"""),
                nothing,
                optional = true,
            ) |> season,
        # genres come from the genre link targets; order is unspecified (Set)
        "genres" => extract(
            ("""/$medium/genre/.*?/""", "\""),
            ("""/$medium/genre/.*?/""", "\""),
            multiple = true,
        ),
        # for manga the "studios" field holds the serialization entries
        "studios" =>
            extract(
                ("""<span class="dark_text">Studios:</span>""", "</div>"),
                ("""<span class="dark_text">Serialization:</span>""", "</div>"),
            ) |> studios,
        "source" =>
            extract(
                ("""<span class="dark_text">Source:</span>""", "</div>"),
                ("""<span class="dark_text">Source:</span>""", "</div>"),
                optional = true,
            ) |> source,
        "main_picture" => extract(
            ("""<meta property="og:image" content=\"""", "\""),
            ("""<meta property="og:image" content=\"""", "\""),
        ),
    )
end

# Scrape the "Related Entries" section of a MyAnimeList item page.
#
# `text` is the page HTML; `medium` / `itemid` identify the page and are copied
# into every record as the relation source. Returns a vector of Dicts
# (version, relation, source_id, source_medium, target_id, target_medium).
# The page is walked token by token (each token is either a tag or a run of
# text between tags) with a small state machine; statement order matters.
function mal_get_media_relations(text::String, medium::String, itemid::String)
    # Relation labels as printed on the page -> normalized relation names.
    relation_types = Dict(
        "Sequel" => "SEQUEL",
        "Prequel" => "PREQUEL",
        "Alternative Setting" => "ALTERNATIVE_SETTING",
        "Alternative Version" => "ALTERNATIVE_VERSION",
        "Side Story" => "SIDE_STORY",
        "Summary" => "SUMMARY",
        "Full Story" => "FULL_STORY",
        "Parent Story" => "PARENT_STORY",
        "Spin-Off" => "SPIN_OFF",
        "Adaptation" => "ADAPTATION",
        "Character" => "CHARACTER",
        "Other" => "OTHER",
    )
    records = Set() # a Set, so identical records are stored once
    related_entries_section = false # true once the "Related Entries" heading was seen
    last_line = nothing # previous token
    last_relation = nothing # relation that applies to the links that follow
    last_href = nothing # first occurrence of a link awaiting its repeat (picture section)
    picture_section = true # false once the table-style part of the section starts
    for match in eachmatch(r"([^<>]+|</?[^>]+>)", text)
        line = strip(match.match)
        prev_line = last_line
        cur_line = line # NOTE(review): never read afterwards
        last_line = line
        if line == "Related Entries"
            related_entries_section = true
            continue
        end
        # ignore everything before the section heading
        if !related_entries_section
            continue
        end
        # this cell marks the end of the section: return what was collected
        if line == """<td class="pb24">"""
            if !isnothing(last_href)
                logerror("did not finish parsing $last_href for $medium $itemid")
            end
            return collect(records)
        end
        # relation label in the picture-style layout: the first line of the
        # text that follows the "relation" div
        if prev_line == """<div class="relation">"""
            line = strip(first(split(line, "\n")))
            last_relation = get(relation_types, line, nothing)
            if isnothing(last_relation)
                logerror("could not parse relation $line for $medium $itemid")
                continue
            end
            continue
        end
        # relation label in the table-style layout; seeing one also ends the
        # picture-style handling of links below
        if prev_line == """<td valign="top" class="ar fw-n borderClass nowrap">"""
            picture_section = false
            line = line[1:end-1] # strip trailing colon
            last_relation = get(relation_types, line, nothing)
            if isnothing(last_relation)
                logerror("could not parse relation $line for $medium $itemid")
                continue
            end
            continue
        end
        for m in
            eachmatch(r"""<a href="https://myanimelist.net/(manga|anime)/([0-9]+)/""", line)
            if picture_section
                # In the picture layout a link is only recorded when the same
                # tag is seen a second time: the first occurrence is remembered
                # and skipped (presumably image link then title link - verify
                # against the page markup).
                if isnothing(last_href)
                    last_href = line
                    continue
                elseif last_href == line
                    last_href = nothing
                else
                    logerror("unexpected href $line for $medium $itemid")
                    last_href = nothing
                    continue
                end
            end
            if isnothing(last_relation)
                logerror("could not find relation for $line for $medium $itemid")
                continue
            end
            m_medium, m_itemid = m.captures
            d = Dict(
                "version" => API_VERSION,
                "relation" => last_relation,
                "source_id" => itemid,
                "source_medium" => medium,
                "target_id" => m_itemid,
                "target_medium" => m_medium,
            )
            push!(records, d)
            continue
        end
    end
    # reached only when the end-of-section cell was never seen
    logerror("could not parse relations $medium $itemid")
    collect(records)
end;

# Anilist

In [4]:
"""
    should_retry_anilist(r)

Retry predicate for AniList replies: `true` (after logging) for error statuses
other than 403/404, `false` otherwise.
"""
function should_retry_anilist(r)
    retryable = HTTP.iserror(r) && !(r.status in [403, 404])
    if retryable
        logerror("status code $(r.status) for $(r.request.url)")
    end
    return retryable
end

"""
    anilist_date(x)

Render an AniList fuzzy-date object (a dict with "year", "month" and "day"
entries, any of which may be absent or `nothing`) as `"year-month-day"`.
Missing components are left empty, so a completely unknown date is `"--"`.
A "date" entry, when present, takes precedence over "day".

`x` itself may be `nothing` (a null date object); that is rendered as `"--"`
as well instead of raising a `MethodError`.
"""
function anilist_date(x)
    isnothing(x) && return "--"
    # Like `get`, but a stored `nothing` (JSON null) also falls back to `default`.
    function safeget(key, default)
        y = get(x, key, default)
        return isnothing(y) ? default : y
    end
    return string(
        safeget("year", ""),
        "-",
        safeget("month", ""),
        "-",
        safeget("date", safeget("day", "")),
    )
end

"""
    anilist_get_list(s, resource, userid)

Download the anime or manga list (`resource` is "animelist" or "mangalist") of
the AniList user `userid` through the GraphQL API, one chunk at a time.

Returns `Dict("data" => records)` with one record per media item; an item that
appears in several lists is stored once, and the names of the custom lists it
appears in are accumulated in "listnames". Returns `nothing` when a request
comes back with status >= 400, or when the list still has more chunks after
`max_chunks - 1` of them have been fetched.
"""
function anilist_get_list(s::Session, resource::String, userid::String)
    if resource == "animelist"
        mediatype = "ANIME"
    elseif resource == "mangalist"
        mediatype = "MANGA"
    else
        @assert false
    end
    headers = Dict("Content-Type" => "application/json")
    url = "https://graphql.anilist.co"
    # The query text does not depend on the chunk, so build it once.
    query = """
    query (\$userID: Int, \$MEDIA: MediaType, \$chunk: Int) {
        MediaListCollection (userId: \$userID, type: \$MEDIA, chunk: \$chunk) {
            hasNextChunk
            user {
                name
            }        
            lists {
                name
                isCustomList
                entries {
                    status
                    score(format: POINT_10_DECIMAL)
                    progress
                    progressVolumes
                    repeat
                    priority
                    notes
                    startedAt {
                        year
                        month
                        day
                    }
                    completedAt {
                        year
                        month
                        day
                    }
                    updatedAt
                    createdAt
                    media {
                        id
                        idMal
                    }
                }
            }
        }
    }
    """
    entries = Dict()
    max_chunks = 1000
    chunk = 1
    has_next_chunk = true
    while has_next_chunk
        # Give up only when the server still reports more data at the cap; a
        # list that ends exactly on the last allowed chunk is returned normally.
        if chunk >= max_chunks
            logerror("too many chunks $url")
            return nothing
        end
        variables = Dict("userID" => userid, "MEDIA" => mediatype, "chunk" => chunk)
        r = request(
            s,
            "POST",
            url,
            headers,
            Vector{UInt8}(JSON.json(Dict("query" => query, "variables" => variables))),
            should_retry_anilist,
        )
        if r.status >= 400
            logerror("received status $(r.status) for $url")
            return nothing
        end
        collection = JSON.parse(r.body)["data"]["MediaListCollection"]
        username = collection["user"]["name"]
        for x in collection["lists"]
            for entry in x["entries"]
                key = entry["media"]["id"]
                if !haskey(entries, key)
                    entries[key] = Dict(
                        "version" => API_VERSION,
                        "userid" => userid,
                        "username" => username,
                        "anilistid" => entry["media"]["id"],
                        "malid" => entry["media"]["idMal"],
                        "status" => entry["status"],
                        "score" => entry["score"],
                        "progress" => entry["progress"],
                        "progress_volumes" => entry["progressVolumes"],
                        "repeat" => entry["repeat"],
                        "priority" => entry["priority"],
                        "notes" => entry["notes"],
                        "listnames" => String[],
                        "started_at" => anilist_date(entry["startedAt"]),
                        "completed_at" => anilist_date(entry["completedAt"]),
                        "updated_at" => entry["updatedAt"],
                        "created_at" => entry["createdAt"],
                    )
                end
                if x["isCustomList"]
                    push!(entries[key]["listnames"], x["name"])
                end
            end
        end
        has_next_chunk = collection["hasNextChunk"]
        chunk += 1
    end
    Dict("data" => collect(values(entries)))
end

"""
    anilist_get_userid(s, username)

Look up the AniList user id for `username` through the GraphQL API. Returns
the id from the reply, or `nothing` when the request comes back with status
>= 400.
"""
function anilist_get_userid(s::Session, username::String)
    url = "https://graphql.anilist.co"
    payload = Dict(
        "query" => "query (\$username: String) { User (name: \$username) { id } }",
        "variables" => Dict("username" => username),
    )
    reply = request(
        s,
        "POST",
        url,
        Dict("Content-Type" => "application/json"),
        Vector{UInt8}(JSON.json(payload)),
        should_retry_anilist,
    )
    if reply.status >= 400
        logerror("status code $(reply.status) for $url")
        return nothing
    end
    parsed = JSON.parse(String(reply.body))
    return parsed["data"]["User"]["id"]
end

"""
    anilist_get_media(s, resource, itemid)

Fetch one anime or manga (`resource` is "anime" or "manga") from the AniList
GraphQL API. Returns `Dict("details" => ..., "relations" => ...)`, or `nothing`
when the request comes back with status >= 400.

"season" is rendered as `"<seasonYear> <season>"`; a component that is absent
or null is left empty (previously a null was rendered as the text "nothing").
"""
function anilist_get_media(s::Session, resource::String, itemid::String)
    if resource == "anime"
        mediatype = "ANIME"
    elseif resource == "manga"
        mediatype = "MANGA"
    else
        @assert false
    end
    url = "https://graphql.anilist.co"
    query = """
    query (\$id: Int, \$MEDIA: MediaType)
    {
        Media (id: \$id, type:\$MEDIA) {
            id,
            idMal,
            title {
                romaji
                english
            },
            format,
            description,
            genres,
            startDate {
                year
                month
                day
            },
            endDate {
                year
                month
                day
            },
            seasonYear,
            season,
            episodes,
            duration,
            chapters,
            volumes,
            status(version: 2),
            studios {
                edges {
                    node {
                        name
                    }
                }
            }
            type,
            source(version: 3),
            coverImage {
                medium
                large
                extraLarge
            }
            bannerImage
            relations {
                edges {
                    node {
                        id
                        type
                    }
                    relationType(version: 2),
                }
            }
        }
    }"""
    variables = Dict("id" => itemid, "MEDIA" => mediatype)
    headers = Dict("Content-Type" => "application/json")
    r = request(
        s,
        "POST",
        url,
        headers,
        Vector{UInt8}(JSON.json(Dict("query" => query, "variables" => variables))),
        should_retry_anilist,
    )
    if r.status >= 400
        logerror("received status $(r.status) for $url")
        return nothing
    end
    safeget(x::Dict, k::String) = get(x, k, nothing)
    # A JSON null arrives as `nothing`; render it as an empty string.
    textor(x::Dict, k::String) = something(get(x, k, ""), "")
    media = JSON.parse(r.body)["data"]["Media"]
    details = Dict(
        "version" => API_VERSION,
        "anilistid" => media["id"],
        "malid" => safeget(media, "idMal"),
        "title" => safeget(media["title"], "romaji"),
        "alttitle" => safeget(media["title"], "english"),
        "type" => safeget(media, "format"),
        "summary" => safeget(media, "description"),
        "genres" => safeget(media, "genres"),
        "startdate" => anilist_date(safeget(media, "startDate")),
        "enddate" => anilist_date(safeget(media, "endDate")),
        "season" => string(textor(media, "seasonYear"), " ", textor(media, "season")),
        "episodes" => safeget(media, "episodes"),
        "duration" => safeget(media, "duration"),
        "chapters" => safeget(media, "chapters"),
        "volumes" => safeget(media, "volumes"),
        "status" => safeget(media, "status"),
        # NOTE(review): names are concatenated without a separator - confirm
        # whether a delimiter was intended.
        "studios" => join([x["node"]["name"] for x in get(media["studios"], "edges", [])]),
        "source" => safeget(media, "source"),
    )
    relations = Dict{String,Any}[]
    for e in get(media["relations"], "edges", [])
        d = Dict{String,Any}(
            "version" => API_VERSION,
            "relation" => e["relationType"],
            "source_id" => media["id"],
            "source_media" => media["type"],
            "target_id" => e["node"]["id"],
            "target_media" => e["node"]["type"],
        )
        push!(relations, d)
    end
    Dict("details" => details, "relations" => relations)
end;

# Kitsu

In [14]:
"""
    should_retry_kitsu(r)

Retry predicate for Kitsu replies: `true` (after logging) for error statuses
other than 403/404, `false` otherwise.
"""
function should_retry_kitsu(r)
    if !HTTP.iserror(r) || r.status in [403, 404]
        return false
    end
    logerror("status code $(r.status) for $(r.request.url)")
    return true
end

"""
    kitsu_get_token(s, credentials)

Exchange Kitsu login credentials for an OAuth access token. `credentials` is
`"<username> <password>"`; only the first space separates the two, so the
password may itself contain spaces.

Returns `Dict("token" => ..., "expiry_time" => ...)`, where `expiry_time` is
the current time plus the reply's `expires_in`. Returns `nothing` (after
logging) when the credentials string has no space or the request comes back
with status >= 400.
"""
function kitsu_get_token(s::Session, credentials::String)
    parts = split(credentials, " "; limit = 2)
    if length(parts) != 2
        # deliberately does not echo the credentials into the log
        logerror("could not generate Kitsu token, malformed credentials")
        return nothing
    end
    username, password = parts
    url = "https://kitsu.io/api/oauth/token"
    headers = Dict("Content-Type" => "application/json")
    body = JSON.json(
        Dict("grant_type" => "password", "username" => username, "password" => password),
    )
    r = request(s, "POST", url, headers, Vector{UInt8}(body), should_retry_kitsu)
    if r.status >= 400
        logerror("could not generate Kitsu token, received code $(r.status)")
        return nothing
    end
    response_data = JSON.parse(r.body)
    token = response_data["access_token"]
    expires_in = response_data["expires_in"]
    # NOTE(review): based on local wall-clock time, like the rest of the file -
    # confirm that consumers compare against the same clock.
    expiry_time = Dates.datetime2unix(Dates.now()) + expires_in
    Dict("token" => token, "expiry_time" => expiry_time)
end;

"""
    kitsu_get_userid(s, auth_token, username)

Look up the Kitsu user id whose slug equals `username`. Returns the id of the
single matching user, or `nothing` (after logging) when the request comes back
with status >= 400 or the filter does not match exactly one user.
"""
function kitsu_get_userid(s::Session, auth_token::String, username::String)
    url = "https://kitsu.io/api/edge/users?filter[slug]=$username"
    headers = Dict("Authorization" => "Bearer $auth_token")
    r = request(s, "GET", url, headers, UInt8[], should_retry_kitsu)
    if r.status >= 400
        logerror("received status $(r.status) for $url")
        return nothing
    end
    json = JSON.parse(r.body)["data"]
    # (debug `println(json)` removed: it wrote to stdout outside the log lock)
    if length(json) != 1
        logerror("found $(length(json)) != 1 users for $url")
        return nothing
    end
    only(json)["id"]
end;

In [11]:
# @time a = fetch(
#     "s1",
#     "brd.superproxy.io:22225:brd-customer-hl_bc25f36d-zone-shared-ip-205.237.95.167:72lon8836hbd",
#     "brightdata",
#     4.0,
#     "",
#     "kitsu",
#     "token",
#     "kebon69243@ippals.com xpTsfKyGRVbmeHzF",
# )

  0.836800 seconds (91.38 k allocations: 6.676 MiB, 3.24% compilation time)


Dict{String, Any} with 2 entries:
  "token"       => "t29V_vjj60GFSav7VxjoG6fYagPN62s2AJ_7cq4gpB4"
  "expiry_time" => 1.73319e9

In [15]:
# @time b = fetch(
#     "session2",
#     "brd.superproxy.io:22225:brd-customer-hl_bc25f36d-zone-shared-ip-205.237.95.167:72lon8836hbd",
#     "brightdata",
#     4.0,
#     "t29V_vjj60GFSav7VxjoG6fYagPN62s2AJ_7cq4gpB4",
#     "kitsu",
#     "userid",
#     "Fro116",
# )

Any[]
2024-11-25T04:22:36.856 [ERROR] found 0 != 1 users for https://kitsu.io/api/edge/users?filter[slug]=Fro116
  0.399666 seconds (714 allocations: 151.508 KiB)


In [33]:
# url = "https://kitsu.app/api/oauth/token"
# headers = Dict("Content-Type" => "application/json")
# body = JSON.json(
#     Dict(
#         "grant_type" => "password",
#         "username" => "kebon69243@ippals.com",
#         "password" => "xpTsfKyGRVbmeHzF",
#     ),
# )
# r = HTTP.request("POST", url, headers, Vector{UInt8}(body); status_exception = false)

HTTP.Messages.Response:
"""
HTTP/1.1 403 Forbidden
Date: Mon, 25 Nov 2024 09:11:01 GMT
Content-Type: text/html; charset=UTF-8
Transfer-Encoding: chunked
Connection: close
Accept-CH: Sec-CH-UA-Bitness, Sec-CH-UA-Arch, Sec-CH-UA-Full-Version, Sec-CH-UA-Mobile, Sec-CH-UA-Model, Sec-CH-UA-Platform-Version, Sec-CH-UA-Full-Version-List, Sec-CH-UA-Platform, Sec-CH-UA, UA-Bitness, UA-Arch, UA-Full-Version, UA-Mobile, UA-Model, UA-Platform-Version, UA-Platform, UA
Critical-CH: Sec-CH-UA-Bitness, Sec-CH-UA-Arch, Sec-CH-UA-Full-Version, Sec-CH-UA-Mobile, Sec-CH-UA-Model, Sec-CH-UA-Platform-Version, Sec-CH-UA-Full-Version-List, Sec-CH-UA-Platform, Sec-CH-UA, UA-Bitness, UA-Arch, UA-Full-Version, UA-Mobile, UA-Model, UA-Platform-Version, UA-Platform, UA
Cross-Origin-Embedder-Policy: require-corp
Cross-Origin-Opener-Policy: same-origin
Cross-Origin-Resource-Policy: same-origin
Origin-Agent-Cluster: ?1
Permissions-Policy: accelerometer=(),autoplay=(),browsing-topics=(),camera=(),clipboard-

In [46]:
# Time a batch of five identical requests sent concurrently to the local proxy
# endpoint. Each iteration builds its own payload asking the proxy to GET
# google.com; the responses are discarded because only the elapsed time matters.
@time Threads.@threads for i in 1:5
    payload = Dict(
        "url" => "http://www.google.com",
        "method" => "GET",
        "headers" => Dict(),
        "body" => nothing,
        "proxyurl" => nothing,
        "sessionid" => 1,
    )
    # The payload is sent as raw JSON bytes.
    encoded = Vector{UInt8}(JSON.json(payload))
    HTTP.request(
        "POST",
        "http://localhost:8000/proxy",
        Dict("Content-Type" => "application/json"),
        encoded;
        status_exception = false,
    )
end

  0.267854 seconds (51.44 k allocations: 4.561 MiB, 167.47% compilation time)


In [36]:
# Decode the proxy's JSON envelope and show its "content" field.
# `String(::Vector{UInt8})` takes ownership of (and empties) its argument, so the
# body is copied first to keep `r.body` intact for re-running this cell.
let envelope = JSON.parse(String(copy(r.body)))
    envelope["content"]
end

"<!doctype html><html itemscope=\"\" itemtype=\"http://schema.org/WebPage\" lang=\"en\"><head><meta charset=\"UTF-8\"><meta content=\"origin\" name=\"referrer\"><meta content=\"/images/branding/googleg/1x/googleg_standard_color_128dp.png\" itemprop=\"image\"><title>Google</title><scrip"[93m[1m ⋯ 204391 bytes ⋯ [22m[39m"5rO\\x22 aria-level\\x3d\\x221\\x22 role\\x3d\\x22heading\\x22\\x3eChoose what you\\u2019re giving feedback on\\x3c/div\\x3e');})();(function(){google.drty&&google.drty(undefined,true);})();});if (!google.stvsc){google.drty && google.drty(undefined,true);}\n</script></body></html>"

In [22]:
# t29V_vjj60GFSav7VxjoG6fYagPN62s2AJ_7cq4gpB4

In [37]:

# Request an OAuth token from Kitsu by routing a password-grant POST through the
# local proxy service.
#
# The account credentials are read from the environment instead of being written
# into the notebook, so they are not committed along with the source. Set
# KITSU_USERNAME and KITSU_PASSWORD before running this cell; `ENV[...]` throws a
# `KeyError` if either one is missing.
body = Dict(
    "url" => "https://kitsu.app/api/oauth/token",
    "method" => "POST",
    "headers" => nothing,
    "body" => Dict(
        "grant_type" => "password",
        "username" => ENV["KITSU_USERNAME"],
        "password" => ENV["KITSU_PASSWORD"],
    ),
    "proxyurl" => nothing,
    "sessionid" => 2,
)
# `status_exception = false` returns non-2xx responses instead of throwing, so the
# proxy's reply can be inspected either way.
r = HTTP.request(
    "POST",
    "http://localhost:8000/proxy",
    Dict("Content-Type" => "application/json"),
    Vector{UInt8}(JSON.json(body));
    status_exception = false,
)

HTTP.Messages.Response:
"""
HTTP/1.1 200 OK
date: Mon, 25 Nov 2024 16:03:27 GMT
server: uvicorn
content-length: 1580
content-type: application/json

{"status_code":200,"headers":{"date":"Mon, 25 Nov 2024 16:03:28 GMT","content-type":"application/json; charset=utf-8","alt-svc":"h3=\":443\"; ma=86400","cache-control":"no-store","etag":"W/\"1dea22060a170c3e3f3301304dd45ab7\"","referrer-policy":"strict-origin-when-cross-origin","strict-transport-security":"max-age=63072000; includeSubDomains","vary":"Origin","x-content-type-options":"nosniff","x-download-options":"noopen","x-frame-options":"SAMEORIGIN","x-permitted-cross-domain-policies":"none","x-request-id":"6db05128-e5c6-4612-b68f-d05683bd5e93","x-runtime":"0.368865","x-xss-protection":"1; mode=block","cf-cache-status":"DYNAMIC","server-timing":"cfCacheStatus;desc=\"DYNAMIC\", cfL4;desc=\"?proto=TCP&rtt=10699&sent=7&recv=9&lost=0&retrans=0&sent_bytes=3988&recv_bytes=2586&delivery_rate=452547&cwnd=232&unsent_bytes=0&cid=78c65ee2745

In [40]:
# The proxy response is a JSON envelope whose "content" field is itself a JSON
# string, so it is decoded in two steps.
# `String(::Vector{UInt8})` takes ownership of (and empties) its argument, so the
# body is copied first to keep `r.body` intact for re-running this cell.
let envelope = JSON.parse(String(copy(r.body)))
    inner = envelope["content"]
    JSON.parse(inner)
end

Dict{String, Any} with 6 entries:
  "refresh_token" => "6GjyJQ1KmrMsmf2i1lZSAO90m4SrZoR4kqHA4NTtGx8"
  "created_at"    => 1730612355
  "access_token"  => "t29V_vjj60GFSav7VxjoG6fYagPN62s2AJ_7cq4gpB4"
  "token_type"    => "Bearer"
  "scope"         => "public"
  "expires_in"    => 653747

In [5]:
# url = "https://kitsu.io/api/oauth/token"
# headers = Dict("Content-Type" => "application/json")
# body = JSON.json(
#     Dict(
#         "grant_type" => "password",
#         "username" => "kebon69243@ippals.com",
#         "password" => "xpTsfKyGRVbmeHzF",
#     ),
# )
# r = HTTP.request(
#     "POST",
#     url,
#     chrome_headers ∪ headers,
#     Vector{UInt8}(body);
#     status_exception = false,
# )

HTTP.Messages.Response:
"""
HTTP/1.1 403 Forbidden
Server: cloudflare
Date: Mon, 25 Nov 2024 10:20:05 GMT
Content-Type: text/html
Content-Length: 553
Connection: keep-alive
CF-RAY: 8e80e776ff8a438c-EWR

<html>
<head><title>403 Forbidden</title></head>
<body>
<center><h1>403 Forbidden</h1></center>
<hr><center>cloudflare</center>
</body>
</html>
<!-- a padding to disable MSIE and Chrome friendly error page -->
<!-- a padding to disable MSIE and Chrome friendly error page -->
<!-- a padding to disable MSIE and Chrome friendly error page -->
<!-- a padding to disable MSIE and Chrome friendly error page -->
<!-- a padding to disable MSIE and Chrome friendly error page -->
<!-- a padding to disable MSIE and Chrome friendly error page -->
"""

In [35]:
# import CodecZstd
# unpack(d::Vector{UInt8}) =
#     JSON.parse(String(CodecZstd.transcode(CodecZstd.ZstdDecompressor, copy(d))))

unpack (generic function with 1 method)

In [36]:
# unpack(r.body)

Dict{String, Any} with 6 entries:
  "refresh_token" => "6GjyJQ1KmrMsmf2i1lZSAO90m4SrZoR4kqHA4NTtGx8"
  "created_at"    => 1730612355
  "access_token"  => "t29V_vjj60GFSav7VxjoG6fYagPN62s2AJ_7cq4gpB4"
  "token_type"    => "Bearer"
  "scope"         => "public"
  "expires_in"    => 675827