# Fetching a user's anime-list
* Given a username, we fetch an up-to-date copy of that user's anime-list
* Supports reading public anime-lists from MyAnimeList and AniList

In [1]:
import json
import os
import pickle
import time

import pandas as pd
import requests
from ratelimit import limits, sleep_and_retry
from tqdm import tqdm

In [2]:
# Work from the recommendations data directory; the relative file names below
# are resolved against it.
data_path = "../../data/recommendations"
os.chdir(data_path)
# Strip surrounding whitespace so that a trailing newline in either file does
# not become part of the username or the source name.
with open("recommendee_username.txt") as f:
    username = f.read().strip()
with open("anime_list_source.txt") as f:
    source = f.read().strip()
# Validate with an explicit raise: assert statements are skipped under -O.
if source not in ("MAL", "MALOfficialAPI", "AniList"):
    raise ValueError(f"Unsupported anime-list source {source!r}")

In [3]:
# Each recommendee gets a dedicated working directory named after them.
data_path = os.path.join(data_path, username)
# exist_ok=True replaces the exists()/mkdir() pair, which could fail if the
# directory appeared between the check and the creation.
os.makedirs(data_path, exist_ok=True)
os.chdir(data_path)

In [4]:
def import_from_mal(username):
    """Download a user's rated anime from MyAnimeList via the Jikan v3 API.

    Pages through the user's full anime list, one request per page, until a
    page comes back with fewer entries than the API's page size.

    Args:
        username: MyAnimeList username whose list is fetched.

    Returns:
        A DataFrame with columns ``username``, ``anime_id`` and ``my_score``.
        Entries whose score is 0 are left out.

    Raises:
        requests.HTTPError: if a page request ends with an error status that
            is not retried (raised by ``raise_for_status``).
    """

    @sleep_and_retry
    @limits(calls=1, period=4)
    def call_api(url):
        # A 500/503 reply, or any exception raised by the request itself,
        # leads to a long pause followed by another attempt.
        try:
            response = requests.get(url)
            if response.status_code in (500, 503):
                # This can occur if MAL servers go down
                raise Exception(f"{response.status_code}")
        except Exception as e:
            timeout = 600
            print(
                f"Received error {str(e)} while accessing {url}. Retrying in {timeout} seconds"
            )
            time.sleep(timeout)
            return call_api(url)
        else:
            return response

    def to_frame(payload):
        # One (anime id, score) row per entry of the page.
        rows = []
        for entry in payload["anime"]:
            rows.append((entry["mal_id"], entry["score"]))
        return pd.DataFrame.from_records(rows, columns=["anime_id", "my_score"])

    def iter_pages():
        # Yield one DataFrame per API page; a short page is the last one.
        page_size = 300  # property of the API
        page_number = 1
        while True:
            response = call_api(
                f"https://api.jikan.moe/v3/user/{username}/animelist/all/{page_number}"
            )
            response.raise_for_status()
            frame = to_frame(response.json())
            yield frame
            if len(frame) < page_size:
                return
            page_number += 1

    # Wrapping the page generator in tqdm reports one tick per fetched page.
    frames = list(tqdm(iter_pages()))

    combined = pd.concat(frames, ignore_index=True)
    combined["username"] = username
    # remove unrated shows
    rated = combined[combined["my_score"] != 0]
    return rated[["username", "anime_id", "my_score"]]

In [5]:
def import_from_mal_official_api(username):
    """Fetch a user's rated anime through the official MyAnimeList v2 API.

    The access token is read from ``../../mal/mal_authentication/token.json``
    (relative to the current working directory) and sent as a Bearer token.
    The list is fetched page by page, following the ``paging.next`` link of
    each response until none is present.

    Args:
        username: MyAnimeList username whose list is fetched.

    Returns:
        A DataFrame with columns ``anime_id``, ``my_score`` and ``username``.
        Entries whose score is 0 are left out.

    Raises:
        Exception: if the API answers 403 or 404 for this user.
        requests.HTTPError: for any other error status that is not retried
            (raised by ``raise_for_status``).
    """
    # Use a context manager so the token file handle is closed right away
    # instead of being left open until garbage collection.
    with open("../../mal/mal_authentication/token.json", "r") as token_file:
        token = json.load(token_file)

    @sleep_and_retry
    @limits(calls=1, period=0.75)
    def call_api(url):
        # A 500 reply, or any exception raised by the request itself, leads
        # to a long pause followed by another attempt.
        try:
            response = requests.get(
                url, headers={"Authorization": f'Bearer {token["access_token"]}'}
            )
            if response.status_code == 500:
                # This can occur if MAL servers go down
                raise Exception(f"{response.status_code}")
        except Exception as e:
            retry_timeout = 3600
            print(
                f"Received error {str(e)} while accessing {url}. Retrying in {retry_timeout} seconds"
            )
            time.sleep(retry_timeout)
            return call_api(url)
        return response

    def process_json(payload):
        # One (anime id, score) row per entry of the page.
        return pd.DataFrame.from_records(
            [(x["node"]["id"], x["list_status"]["score"]) for x in payload["data"]],
            columns=["anime_id", "my_score"],
        )

    def get_user_anime_list(username):
        # Returns (DataFrame, True) on success, (empty DataFrame, False) when
        # the list cannot be read (403/404).
        anime_lists = []
        more_pages = True
        url = f"https://api.myanimelist.net/v2/users/{username}/animelist?limit=1000&fields=list_status"
        while more_pages:
            response = call_api(url)

            if response.status_code == 403 or response.status_code == 404:
                # 403: This can occur if the user privated their list
                # 404: This can occur if the user deleted their account
                return pd.DataFrame(), False

            response.raise_for_status()
            # Named "payload" rather than "json" so the json module is not
            # shadowed inside this function.
            payload = response.json()
            anime_lists.append(process_json(payload))

            more_pages = "next" in payload["paging"]
            if more_pages:
                url = payload["paging"]["next"]
        user_anime_list = pd.concat(anime_lists, ignore_index=True)
        user_anime_list["username"] = username
        # remove unrated shows
        user_anime_list = user_anime_list.loc[lambda x: x["my_score"] != 0]
        return user_anime_list, True

    df, ret = get_user_anime_list(username)
    if not ret:
        raise Exception(f'Could not resolve list for {username}')
    return df

In [6]:
def import_from_anilist(username):
    """Fetch a user's rated anime from AniList, keyed by MyAnimeList id.

    Two GraphQL requests are sent to ``https://graphql.anilist.co``: one to
    look up the user's id, one to read their anime list collection.

    Args:
        username: AniList username whose list is fetched.

    Returns:
        A DataFrame with columns ``anime_id`` (the entry's ``idMal``),
        ``my_score`` and ``username``. Entries whose score is 0 are left out,
        and an anime that appears on several lists is reduced to a single row
        holding the mean of its scores.

    Raises:
        requests.HTTPError: if a request ends with an error status that is
            not retried (raised by ``raise_for_status``).
    """

    @sleep_and_retry
    @limits(calls=1, period=1)
    def call_api(url, payload):
        # A 500/503 reply, or any exception raised by the request itself,
        # leads to a long pause followed by another attempt.
        try:
            response = requests.post(url, json=payload)
            if (response.status_code == 500) or (response.status_code == 503):
                # This can occur if servers go down
                raise Exception(f"{response.status_code}")
        except Exception as e:
            timeout = 600
            print(
                f"Received error {str(e)} while accessing {url}. Retrying in {timeout} seconds"
            )
            time.sleep(timeout)
            # The retry must resend the same payload; calling with the url
            # alone would raise TypeError instead of retrying.
            return call_api(url, payload)
        return response

    url = "https://graphql.anilist.co"

    def process_json(payload):
        # Flatten every entry of every list into (MAL id, score) rows.
        records = [
            (entry["media"]["idMal"], entry["score"])
            for x in payload["data"]["MediaListCollection"]["lists"]
            for entry in x["entries"]
        ]
        return pd.DataFrame.from_records(records, columns=["anime_id", "my_score"])

    def get_user_id(username):
        query = "query ($userName: String) { User (search: $userName) { id } }"
        # NOTE(review): the username is sent wrapped in single quotes; confirm
        # this is what the `search` argument expects.
        variables = {"userName": "'" + username + "'"}
        response = call_api(url, {"query": query, "variables": variables})
        response.raise_for_status()
        return response.json()["data"]["User"]["id"]

    def get_anime_list(username):
        userid = get_user_id(username)

        query = """
        query ($userID: Int, $MEDIA: MediaType) {
        MediaListCollection (userId: $userID, type: $MEDIA) { 
            lists {
                entries
                {
                    status
                    score
                    media
                    {
                        idMal
                    }
                }
            }
        }
        }
        """
        variables = {"userID": str(userid), "MEDIA": "ANIME"}

        response = call_api(url, {"query": query, "variables": variables})
        response.raise_for_status()
        anime_list = process_json(response.json())
        # remove unrated shows
        anime_list = anime_list.loc[lambda x: x["my_score"] != 0]
        # deduplicate shows that appear on multiple lists
        anime_list = anime_list.groupby("anime_id").mean().reset_index()
        anime_list["username"] = username
        return anime_list

    return get_anime_list(username)

In [7]:
# Map each supported source name to the function that downloads a list from it.
import_fns = {
    "MAL": import_from_mal,
    "MALOfficialAPI": import_from_mal_official_api,
    "AniList": import_from_anilist,
}
if source not in import_fns:
    raise Exception(f"Unsupported anime-list source {source}")
# Fetch the recommendee's list with the importer chosen above.
df = import_fns[source](username)

2it [00:04,  2.21s/it]


In [8]:
# Lookup table for anime ids; the later cells expect it to provide the
# "anime_id" and "uid" columns.
anime_to_uid_path = "../../processed_data/anime_to_uid.csv"
anime_to_uid = pd.read_csv(anime_to_uid_path)

In [9]:
# Attach the lookup columns to each list entry. This is pandas' default inner
# join, so entries whose anime_id is missing from the table are dropped.
df = pd.merge(df, anime_to_uid, on="anime_id")

In [10]:
# Lookup table for usernames; the next cell reads its "username" and "uid"
# columns.
username_to_uid_path = "../../processed_data/username_to_uid.csv"
username_to_uid = pd.read_csv(username_to_uid_path)

In [11]:
# Reuse the uid already assigned to this username when there is one;
# otherwise take the next uid after the largest one in the table.
known_usernames = list(username_to_uid["username"])
user_uid = (
    username_to_uid.set_index("username").loc[username, "uid"]
    if username in known_usernames
    else max(username_to_uid["uid"]) + 1
)

In [12]:
# Switch both identifier columns to uids: anime ids come from the merged
# "uid" column, and the username column is overwritten with the user's uid.
df["anime_id"] = df["uid"]
df["username"] = user_uid

In [13]:
# Save only the three columns of interest, without the index column.
output_columns = ["username", "anime_id", "my_score"]
user_anime_list = df[output_columns]
user_anime_list.to_csv("user_anime_list.csv", index=False)