# Fetching a user's anime-list
* Given a username, we fetch an up-to-date version of their anime-list
* Supports reading public anime-lists from MyAnimeList and AniList, as well as a local XML file

In [1]:
import json
import os
import pickle
import time
import xml.etree.ElementTree as ET

import pandas as pd
import requests
from ratelimit import limits, sleep_and_retry
from tqdm import tqdm

In [2]:
# Work from the shared recommendations directory; the two config files below
# are read relative to it.
data_path = "../../data/recommendations"
os.chdir(data_path)
# Strip surrounding whitespace: a trailing newline left by an editor or `echo`
# would otherwise fail the source check below and leak into the per-user
# directory name and the request URLs.
with open("recommendee_username.txt") as f:
    username = f.read().strip()
with open("anime_list_source.txt") as f:
    source = f.read().strip()
assert source in ["MAL", "MALOfficialAPI", "AniList", "XML"], (
    f"Unsupported anime-list source {source!r}"
)

In [3]:
# Create (if needed) and enter the per-user working directory.
# NOTE(review): data_path is still the relative "../../data/recommendations"
# even though the working directory is already that directory; the joined path
# resolves back to <cwd>/<username> only because the last two components of the
# cwd match. Confirm before changing the directory layout.
data_path = os.path.join(data_path, username)
# exist_ok avoids the check-then-create race of `if not exists: mkdir`.
os.makedirs(data_path, exist_ok=True)
os.chdir(data_path)

In [4]:
# Log which user and which list source this run is about to fetch.
print(f"Fetching list for {username} from {source}")

Fetching list for Fro116 from MAL


In [5]:
def import_from_mal_official_api(username):
    """Fetch ``username``'s anime list from the official MyAnimeList v2 API.

    The access token is read from ``../../mal/mal_authentication/token.json``
    (relative to the current working directory) and sent as a Bearer token.
    Result pages of up to 1000 entries are followed through ``paging.next``
    until no further page is advertised.

    Args:
        username: MyAnimeList username whose list is requested.

    Returns:
        DataFrame with columns ``anime_id``, ``my_score`` and ``username``.
        Entries whose status is ``plan_to_watch`` are excluded.

    Raises:
        Exception: if the API answers 403 or 404 for the list.
        requests.HTTPError: for any other unsuccessful status code
            (raised by ``raise_for_status``).
    """
    # Use a context manager so the token file handle is closed promptly.
    with open("../../mal/mal_authentication/token.json", "r") as token_file:
        token = json.load(token_file)

    @sleep_and_retry
    @limits(calls=1, period=0.75)
    def call_api(url):
        """GET ``url``, at most once per 0.75 seconds.

        On a request exception or an HTTP 500 the call waits 60 seconds and
        retries; there is no retry limit.
        """
        try:
            response = requests.get(
                url, headers={"Authorization": f'Bearer {token["access_token"]}'}
            )
            if response.status_code == 500:
                # This can occur if MAL servers go down
                raise Exception(f"{response.status_code}")
        except Exception as e:
            retry_timeout = 60
            print(
                f"Received error {str(e)} while accessing {url}. Retrying in {retry_timeout} seconds"
            )
            time.sleep(retry_timeout)
            return call_api(url)
        return response

    def process_json(payload):
        """Turn one response page into (anime_id, my_score) rows."""
        # `payload` rather than `json`, so the json module is not shadowed.
        df = pd.DataFrame.from_records(
            [
                (x["node"]["id"], x["list_status"]["score"], x["list_status"]["status"])
                for x in payload["data"]
            ],
            columns=["anime_id", "my_score", "status"],
        )
        return df.loc[lambda x: x["status"] != "plan_to_watch"].drop("status", axis=1)

    def get_user_anime_list(username):
        """Return ``(DataFrame, True)`` on success, ``(empty, False)`` on 403/404."""
        anime_lists = []
        more_pages = True
        url = f"https://api.myanimelist.net/v2/users/{username}/animelist?limit=1000&fields=list_status"
        while more_pages:
            response = call_api(url)

            if response.status_code == 403 or response.status_code == 404:
                # 403: This can occur if the user privated their list
                # 404: This can occur if the user deleted their account
                return pd.DataFrame(), False

            response.raise_for_status()
            payload = response.json()
            anime_lists.append(process_json(payload))

            more_pages = "next" in payload["paging"]
            if more_pages:
                url = payload["paging"]["next"]
        user_anime_list = pd.concat(anime_lists, ignore_index=True)
        user_anime_list["username"] = username
        return user_anime_list, True

    df, ret = get_user_anime_list(username)
    if not ret:
        raise Exception(f"Could not resolve list for {username}")
    return df

In [6]:
def import_from_anilist(username):
    """Fetch ``username``'s anime list from the AniList GraphQL API.

    The user id is looked up first, then the user's ANIME
    ``MediaListCollection`` is requested and flattened.

    Args:
        username: AniList username whose list is requested.

    Returns:
        DataFrame with columns ``anime_id`` (the ``idMal`` value of each
        entry), ``my_score`` and ``username``. Entries with status
        ``PLANNING`` are excluded, and a show that appears on several lists
        is collapsed to one row holding the mean of its scores.

    Raises:
        requests.HTTPError: for an unsuccessful response other than the
            500/503 statuses that are retried.
    """
    url = "https://graphql.anilist.co"

    @sleep_and_retry
    @limits(calls=1, period=1)
    def call_api(url, payload):
        """POST ``payload`` as JSON to ``url``, at most once per second.

        On a request exception or an HTTP 500/503 the call waits 60 seconds
        and retries; there is no retry limit.
        """
        try:
            response = requests.post(url, json=payload)
            if (response.status_code == 500) or (response.status_code == 503):
                # This can occur if servers go down
                raise Exception(f"{response.status_code}")
        except Exception as e:
            timeout = 60
            print(
                f"Received error {str(e)} while accessing {url}. Retrying in {timeout} seconds"
            )
            time.sleep(timeout)
            # The retry must resend the same payload; calling with the url
            # alone raises TypeError instead of retrying.
            return call_api(url, payload)
        return response

    def process_json(payload):
        """Flatten every entry of every list into (anime_id, my_score, status)."""
        records = [
            (entry["media"]["idMal"], entry["score"], entry["status"])
            for x in payload["data"]["MediaListCollection"]["lists"]
            for entry in x["entries"]
        ]
        return pd.DataFrame.from_records(
            records, columns=["anime_id", "my_score", "status"]
        )

    def get_user_id(username):
        """Resolve the AniList user id for ``username``."""
        query = "query ($userName: String) { User (search: $userName) { id } }"
        # NOTE(review): the name is wrapped in single quotes before being
        # sent as the search term; kept as-is -- confirm this is intended.
        variables = {"userName": "'" + username + "'"}
        response = call_api(url, {"query": query, "variables": variables})
        response.raise_for_status()
        return response.json()["data"]["User"]["id"]

    def get_anime_list(username):
        """Download, filter and deduplicate the user's anime list."""
        userid = get_user_id(username)

        query = """
        query ($userID: Int, $MEDIA: MediaType) {
        MediaListCollection (userId: $userID, type: $MEDIA) { 
            lists {
                entries
                {
                    status
                    score
                    media
                    {
                        idMal
                    }
                }
            }
        }
        }
        """
        # NOTE(review): $userID is declared Int but is sent as a string;
        # kept as-is because the original relied on it -- confirm.
        variables = {"userID": str(userid), "MEDIA": "ANIME"}

        response = call_api(url, {"query": query, "variables": variables})
        response.raise_for_status()
        anime_list = process_json(response.json())
        anime_list = anime_list.loc[lambda x: x["status"] != "PLANNING"].drop(
            "status", axis=1
        )
        # deduplicate shows that appear on multiple lists
        anime_list = anime_list.groupby("anime_id").mean().reset_index()
        anime_list["username"] = username
        return anime_list

    return get_anime_list(username)

In [7]:
def parse_xml_entry(root):
    """Extract ``(mal_id, score, status)`` from one XML entry element.

    Children tagged ``series_animedb_id``, ``my_score`` and ``my_status``
    supply the id (as int), the score (as float) and the status text.
    A missing tag leaves its default (0, 0 and "" respectively); if a tag
    is repeated, the last occurrence wins.
    """
    converters = {"series_animedb_id": int, "my_score": float}
    fields = {"series_animedb_id": 0, "my_score": 0, "my_status": ""}
    for node in root:
        if node.tag in fields:
            convert = converters.get(node.tag)
            # my_status has no converter: its text is stored unchanged.
            fields[node.tag] = node.text if convert is None else convert(node.text)
    return fields["series_animedb_id"], fields["my_score"], fields["my_status"]


def import_from_xml(username):
    """Build the user's anime list from the XML file at ``../anime_list.xml``.

    The second child of the document's first element must be a
    ``user_name`` tag whose text equals ``username``; otherwise the
    assertion fails. ``anime`` elements whose status is "Plan to Watch"
    are skipped.

    Returns a DataFrame with columns ``anime_id``, ``my_score`` and
    ``username``.
    """
    root = ET.parse("../anime_list.xml").getroot()
    owner = root[0][1]
    assert (owner.tag == "user_name") and (owner.text == username)
    # parse_xml_entry yields (mal_id, score, status) for each <anime> element.
    parsed = (parse_xml_entry(node) for node in root if node.tag == "anime")
    rows = [
        (mal_id, score, username)
        for mal_id, score, status in parsed
        if status != "Plan to Watch"
    ]
    return pd.DataFrame.from_records(
        rows, columns=["anime_id", "my_score", "username"]
    )

In [8]:
# Dispatch table from the configured source name to the function that fetches
# the list. Both "MAL" and "MALOfficialAPI" go through the official-API importer.
import_fns = {
    "MAL": import_from_mal_official_api,
    "MALOfficialAPI": import_from_mal_official_api,
    "AniList": import_from_anilist,
    "XML": import_from_xml,
}
if source not in import_fns:
    raise Exception(f"Unsupported anime-list source {source}")
# df holds the freshly fetched list for the configured user.
df = import_fns[source](username)

In [9]:
# Load the anime mapping table; the path is relative to the current working
# directory. Presumably it has `anime_id` and `uid` columns, since later cells
# merge on the former and read the latter -- verify against the file.
anime_to_uid = pd.read_csv("../../processed_data/anime_to_uid.csv")

In [10]:
# Attach the mapping columns to each list entry. The default inner join drops
# entries whose anime_id has no row in anime_to_uid.
df = pd.merge(df, anime_to_uid, on="anime_id")

In [11]:
# Load the user mapping table. keep_default_na=False stops pandas from turning
# strings such as "NA" or "null" into NaN -- presumably so usernames spelled
# that way survive; confirm.
username_to_uid = pd.read_csv(
    "../../processed_data/username_to_uid.csv", keep_default_na=False
)

In [12]:
# Give this user a uid one past the largest uid in username_to_uid.
# max() raises ValueError if the table has no rows.
user_uid = max(username_to_uid["uid"]) + 1

In [13]:
# Replace the raw identifiers with their uid equivalents: the "username"
# column now holds the numeric user_uid and "anime_id" holds the merged uid.
df = df.assign(username=user_uid, anime_id=df["uid"])

In [14]:
# Keep only the three output columns, then split on whether a score was given.
anime_lists = df.loc[:, ["username", "anime_id", "my_score"]]
# Entries with a score of 0 form the implicit list; their score is set to 1.
implicit_lists = (
    anime_lists[anime_lists["my_score"] == 0]
    .reset_index(drop=True)
    .assign(my_score=1)
)
# Every other entry keeps its score and forms the explicit list.
explicit_lists = anime_lists[anime_lists["my_score"] != 0].reset_index(drop=True)

In [15]:
# Write both lists, without the index column, into the current working
# directory.
explicit_lists.to_csv("user_explicit_lists.csv", index=False)
implicit_lists.to_csv("user_implicit_lists.csv", index=False)