# Setup

In [None]:
import datetime
import re

In [None]:
# NOTE(review): API_PERIOD is not read anywhere in this section; presumably
# ApiSetup.ipynb consumes it to configure request throttling — confirm.
API_PERIOD = 2
# IPython magic: executes the shared setup notebook in this namespace.
# Presumably this is where names used below (pd, logger, call_api_internal,
# get_datapath, sleep_and_retry, limits, API_PERIOD_MULT, ...) come from —
# TODO confirm against ApiSetup.ipynb.
%run ApiSetup.ipynb

In [None]:
# Best-effort load of the MAL client id into MAL_ACCESS_TOKEN.
# The bare `TOKEN_NUMBER` expression raises NameError when no token number has
# been configured, which (like a missing or empty credential file) simply
# leaves MAL_ACCESS_TOKEN untouched.
try:
    TOKEN_NUMBER
    with open(get_datapath(f"mal/mal_authentication/clientid.{TOKEN_NUMBER}.txt")) as f:
        # Only the first line of the file is used.
        MAL_ACCESS_TOKEN = f.readlines()[0].strip()
except Exception:
    # Deliberately silent: the token is optional at this point. Catching
    # `Exception` instead of using a bare `except` lets KeyboardInterrupt and
    # SystemExit propagate.
    pass

In [None]:
def call_api(url):
    """Issue a GET request for `url` via `call_api_internal`, tagged "mal".

    The value of `MAL_ACCESS_TOKEN` is sent in the `X-MAL-CLIENT-ID` header.
    Returns whatever `call_api_internal` returns.
    """
    auth_headers = {"X-MAL-CLIENT-ID": MAL_ACCESS_TOKEN}
    return call_api_internal(url, "GET", "mal", headers=auth_headers)

# Media lists

In [None]:
def get_user_media_list(username, media):
    """Fetch every page of a user's list and combine them into one DataFrame.

    Args:
        username: Account name inserted into the request URL.
        media: Inserted before "list" in the URL path (e.g. "anime" or
            "manga") and forwarded to `process_media_list_json`.

    Returns:
        A `(frame, ok)` tuple. When any request fails, a warning is logged and
        `(empty DataFrame, False)` is returned, discarding pages already
        fetched. Otherwise `frame` holds all pages concatenated, with a
        "username" column added, and `ok` is True.
    """
    url = (
        "https://api.myanimelist.net/v2/users/"
        f"{username}/{media}list?limit=1000&fields=list_status&nsfw=true"
    )
    pages = []
    while True:
        response = call_api(url)
        # 403: This can occur if the user has privated their list
        # 404: This can occur if the user deleted their account
        # Any other non-OK response is handled the same way.
        if response.status_code in [403, 404] or not response.ok:
            logger.warning(f"Error {response} received when handling {url}")
            return pd.DataFrame(), False

        payload = response.json()
        pages.append(process_media_list_json(payload, media))
        paging = payload["paging"]
        if "next" not in paging:
            break
        # Follow the server-provided link to the next page.
        url = paging["next"]

    combined = pd.concat(pages, ignore_index=True)
    combined["username"] = username
    return combined, True

In [None]:
def process_media_list_json(json, media):
    """Turn one decoded list page into a DataFrame.

    Each element of `json["data"]` is converted with `parse_json_node` and the
    resulting one-row frames are concatenated. A page with no entries yields
    an empty DataFrame.
    """
    rows = [parse_json_node(node, media) for node in json["data"]]
    if not rows:
        return pd.DataFrame()
    return pd.concat(rows, ignore_index=True)


def parse_json_node(x, media):
    """Build a one-row DataFrame from a single list entry.

    Args:
        x: Entry dict with a "node" (providing "id") and a "list_status".
        media: "anime" or "manga"; selects which `list_status` keys hold the
            progress and repeat information. Any other value raises KeyError.

    Returns:
        A DataFrame with exactly one row. Fields missing from `list_status`
        default to "" (False for "repeat"); "updated_at" goes through
        `process_timestamp`, "tags"/"notes" through `sanitize_string`.
    """
    status = x["list_status"]
    uid = x["node"]["id"]

    # Per-media names of the (progress, repeat flag, repeat count,
    # repeat value) keys inside `list_status`.
    media_keys = {
        "anime": (
            "num_episodes_watched",
            "is_rewatching",
            "num_times_rewatched",
            "rewatch_value",
        ),
        "manga": (
            "num_chapters_read",
            "is_rereading",
            "num_times_reread",
            "reread_value",
        ),
    }
    progress_key, repeat_key, repeat_count_key, repeat_value_key = media_keys[media]

    row = {
        "uid": uid,
        "status": status.get("status", ""),
        "score": status.get("score", ""),
        "progress": status.get(progress_key, ""),
        "progress_volumes": status.get("num_volumes_read", ""),
        "started_at": status.get("start_date", ""),
        "completed_at": status.get("finish_date", ""),
        "priority": status.get("priority", ""),
        "repeat": status.get(repeat_key, False),
        "repeat_count": status.get(repeat_count_key, ""),
        "repeat_value": status.get(repeat_value_key, ""),
        # Tags arrive as a list and are flattened to one space-separated string.
        "tags": " ".join([sanitize_string(tag) for tag in status.get("tags", [])]),
        "notes": sanitize_string(status.get("comments", "")),
        "updated_at": process_timestamp(status.get("updated_at", None)),
    }
    # Wrap each scalar in a single-element list so the frame has one row.
    return pd.DataFrame.from_dict({column: [value] for column, value in row.items()})

In [None]:
def process_timestamp(time):
    """Convert a timestamp string to unix time, falling back to 0.

    Args:
        time: Timestamp string expected in the form
            "%Y-%m-%dT%H:%M:%S+00:00", or None.

    Returns:
        The result of `to_unix_time` for a parseable value; 0 when `time` is
        None or the conversion fails.
    """
    if time is None:
        return 0
    try:
        return to_unix_time(time, "%Y-%m-%dT%H:%M:%S+00:00")
    except Exception:
        # Best-effort: an unparseable timestamp is treated as "unknown" (0).
        # `Exception` rather than a bare `except` so KeyboardInterrupt and
        # SystemExit are not swallowed.
        return 0

# Media

In [None]:
def get_media_fields(media):
    """Return the list of detail fields to request for `media`.

    Args:
        media: "anime" or "manga".

    Returns:
        The fields shared by both media types followed by the media-specific
        ones.

    Raises:
        AssertionError: If `media` is neither "anime" nor "manga".
    """
    if media == "anime":
        media_fields = [
            "num_episodes",
            "related_anime",
            "average_episode_duration",
            "source",
            "studios",
            "start_season",
        ]
    elif media == "manga":
        media_fields = ["num_volumes", "num_chapters", "related_manga", "authors"]
    else:
        # Raised explicitly (instead of `assert False`) so the check survives
        # `python -O`, where assert statements are removed.
        raise AssertionError(f"unsupported media: {media!r}")
    return [
        "id",
        "title",
        "main_picture",
        "alternative_titles",
        "start_date",
        "end_date",
        "synopsis",
        "num_list_users",
        "num_scoring_users",
        "nsfw",
        "genres",
        "media_type",
        "status",
        "recommendations",
    ] + media_fields

In [None]:
def process_media_details_json(json, media):
    """Convert a decoded media-details response into a one-row DataFrame.

    Args:
        json: Decoded response dict for a single title.
        media: "anime" or "manga"; selects the field list and names the id
            and related-media columns.

    Returns:
        A one-row DataFrame with a column per field from
        `get_media_fields(media)`, where "id" is renamed to "<media>_id" and
        "media_type" to "medium". Plain fields absent from `json` become the
        string "NaN". "related_<media>" and "recommendations" are lists of
        small dicts (empty when the field is absent or null). "genres" is a
        list of genre names, or the string "[]" when absent.
    """

    def get_key(key):
        return [json[key] if key in json else "NaN"]

    related_key = f"related_{media}"
    special_cols = {
        # `or []` tolerates both a missing key and an explicit null, matching
        # how every other field in this function tolerates absence.
        related_key: [
            [
                {f"{media}_id": x["node"]["id"], "relation": x["relation_type"]}
                for x in json.get(related_key) or []
            ]
        ],
        "recommendations": [
            [
                {
                    f"{media}_id": x["node"]["id"],
                    "num_recommendations": x["num_recommendations"],
                }
                for x in json.get("recommendations") or []
            ]
        ],
        # The "[]" string fallback is kept as-is so existing consumers of this
        # column see the same value as before.
        "genres": [[x["name"] for x in json["genres"]] if "genres" in json else "[]"],
    }
    regular_cols = {
        x: get_key(x) for x in get_media_fields(media) if x not in special_cols
    }
    regular_cols.update(special_cols)
    df = pd.DataFrame.from_dict(regular_cols)
    df = df.rename({"id": f"{media}_id", "media_type": "medium"}, axis=1)
    return df


def get_media_details(mediaid, media):
    """Fetch the details of one title and return them as a DataFrame.

    Args:
        mediaid: Id of the title, inserted into the request URL.
        media: "anime" or "manga"; selects the endpoint and requested fields.

    Returns:
        The frame built by `process_media_details_json`, or an empty DataFrame
        (after logging a warning) when the request returns 404 or any other
        error status.
    """
    url = (
        f"https://api.myanimelist.net/v2/{media}/{mediaid}"
        f'?fields={",".join(get_media_fields(media))}&nsfw=true'
    )
    response = call_api(url)

    # A 404 is logged with its status code rather than the exception text.
    if response.status_code == 404:
        logger.warning(f"Received error {response.status_code} while accessing {url}")
        return pd.DataFrame()

    try:
        response.raise_for_status()
    except Exception as e:
        logger.warning(f"Received error {str(e)} while accessing {url}")
        return pd.DataFrame()
    else:
        return process_media_details_json(response.json(), media)

# Media relations

In [None]:
@sleep_and_retry
@limits(calls=1, period=4 * API_PERIOD_MULT)
def call_web_api(url):
    """Issue a GET request for `url` via `call_api_internal`, tagged "web".

    403 is passed as an extra error code to `call_api_internal`. The
    decorators limit this function to one call per `4 * API_PERIOD_MULT`
    period (presumably seconds, sleeping until a call is allowed — confirm
    against the decorators' definitions).
    """
    response = call_api_internal(url, "GET", "web", extra_error_codes=[403])
    return response

In [None]:
def process_media_relations_response(response, uid, media):
    """Scrape the "Related Anime"/"Related Manga" section of a title page.

    The HTML in `response.text` is split on "<" and ">" and scanned starting
    at the fragment equal to "Related <Media>". A fragment matching a known
    relation label (e.g. "Sequel:") sets the current relation type; every
    "/anime/<id>" or "/manga/<id>" link seen afterwards produces a record with
    that relation. Scanning stops at the first link that points back at the
    page's own title, which is treated as the end of the section.

    Args:
        response: Object exposing the page HTML as `.text`.
        uid: Integer id of the title the page belongs to.
        media: "anime" or "manga", the media type of the page.

    Returns:
        A list of `(relation, uid, MEDIA, target_id, TARGET_MEDIA)` tuples
        (media names upper-cased), or `[]` when the page has no
        "Related <Media>" fragment.

    Raises:
        AssertionError: If the section is found but cannot be parsed: a link
            appears before any relation label, or no link back to the page's
            own title is ever seen.
    """
    relation_types = {
        "Sequel:": "SEQUEL",
        "Prequel:": "PREQUEL",
        "Alternative setting:": "ALTERNATIVE",
        "Alternative version:": "ALTERNATIVE",
        "Side story:": "SIDE_STORY",
        "Summary:": "SUMMARY",
        "Full story:": "FULL_STORY",
        "Parent story:": "PARENT_STORY",
        "Spin-off:": "SPIN_OFF",
        "Adaptation:": "ADAPTATION",
        "Character:": "CHARACTER",
        "Other:": "OTHER",
    }

    lines = re.split("<|>", response.text)
    starting_line = f"Related {media.capitalize()}"
    if starting_line not in lines:
        return []

    # Compiled once instead of rebuilding the pattern for every fragment.
    link_patterns = [
        (target_media, re.compile(rf"/{target_media}/([0-9]+)"))
        for target_media in ("anime", "manga")
    ]
    own_media = media.lower()
    parse_error = f"could not parse {media} relations for {uid}"

    records = []
    rtype = None
    for line in lines[lines.index(starting_line):]:
        if line in relation_types:
            rtype = relation_types[line]
            continue
        if "href" not in line:
            continue
        for target_media, pattern in link_patterns:
            for matched_id in pattern.findall(line):
                target_id = int(matched_id)
                # Anime and manga ids are separate namespaces, so only a link
                # of the page's own media type marks the end of the section;
                # a same-numbered id of the other type is a real relation.
                if target_id == uid and target_media == own_media:
                    return records
                if rtype is None:
                    # A link before any relation label means the layout is
                    # not what this parser expects.
                    raise AssertionError(parse_error)
                records.append(
                    (rtype, uid, media.upper(), target_id, target_media.upper())
                )
    # Raised explicitly (instead of `assert False`) so the failure is not
    # silently dropped under `python -O`.
    raise AssertionError(parse_error)


def get_media_relations(uid, media):
    """Download a title's web page and return its relations as a DataFrame.

    Args:
        uid: Id of the title, inserted into the page URL.
        media: "anime" or "manga"; selects the page URL and parsing mode.

    Returns:
        A DataFrame with columns relation, source_id, source_media, target_id
        and target_media built from `process_media_relations_response`. When
        the request fails, a warning is logged and an empty DataFrame (without
        those columns) is returned.
    """
    url = f"https://myanimelist.net/{media}/{uid}"
    response = call_web_api(url)
    try:
        response.raise_for_status()
    except Exception as e:
        logger.warning(f"Received error {str(e)} while accessing {url}")
        return pd.DataFrame()

    column_names = [
        "relation",
        "source_id",
        "source_media",
        "target_id",
        "target_media",
    ]
    relation_records = process_media_relations_response(response, uid, media)
    return pd.DataFrame.from_records(relation_records, columns=column_names)