# Fetching a user anime-list
* Given a user, we get an up-to-date version of their anime-list
* Then, we normalize it and store the normalized version
* Supports reading public anime-lists from MyAnimeList and AniList

In [1]:
# CHANGE THESE PARAMETERS
username = "taapaye"
source = "AniList" # MAL or AniList

In [2]:
import os
import pickle
import time

import pandas as pd
import requests
from ratelimit import limits, sleep_and_retry
from tqdm import tqdm

In [3]:
data_path = "../../data/recommendations"
if not os.path.exists(data_path):
    os.mkdir(data_path)
data_path = os.path.join(data_path, username)
if not os.path.exists(data_path):
    os.mkdir(data_path)
os.chdir(data_path)

In [4]:
def import_from_mal(username):
    @sleep_and_retry
    @limits(calls=1, period=4)
    def call_api(url):
        try:
            response = requests.get(url)
            if (response.status_code == 500) or (response.status_code == 503):
                # This can occur if MAL servers go down
                raise Exception(f"{response.status_code}")
        except Exception as e:
            timeout = 600
            print(
                f"Received error {str(e)} while accessing {url}. Retrying in {timeout} seconds"
            )
            time.sleep(timeout)
            return call_api(url)
        return response

    def process_json(json):
        records = [(x["mal_id"], x["score"]) for x in json["anime"]]
        return pd.DataFrame.from_records(records, columns=["anime_id", "my_score"])

    def get_anime_list(username):
        pages = []

        max_items_per_page = 300  # property of the API
        more_pages = True
        page = 1

        # hack to allow tqdm profiling
        def generator():
            while more_pages:
                yield

        for _ in tqdm(generator()):
            response = call_api(
                f"https://api.jikan.moe/v3/user/{username}/animelist/all/{page}"
            )
            response.raise_for_status()

            paged_anime_list = process_json(response.json())
            pages.append(paged_anime_list)
            page += 1
            if len(paged_anime_list) < max_items_per_page:
                more_pages = False

        anime_list = pd.concat(pages, ignore_index=True)
        anime_list["username"] = username
        # remove unrated shows
        anime_list = anime_list.loc[lambda x: x["my_score"] != 0] 
        return anime_list[["username", "anime_id", "my_score"]]
    
    return get_anime_list(username)

In [5]:
def import_from_anilist(username):
    @sleep_and_retry
    @limits(calls=1, period=1)
    def call_api(url, json):
        try:
            response = requests.post(url, json=json)
            if (response.status_code == 500) or (response.status_code == 503):
                # This can occur if servers go down
                raise Exception(f"{response.status_code}")
        except Exception as e:
            timeout = 600
            print(
                f"Received error {str(e)} while accessing {url}. Retrying in {timeout} seconds"
            )
            time.sleep(timeout)
            return call_api(url)
        return response

    url = "https://graphql.anilist.co"

    def process_json(json):
        records = [
            (entry["media"]["idMal"], entry["score"])
            for x in json["data"]["MediaListCollection"]["lists"]
            for entry in x["entries"]
        ]
        return pd.DataFrame.from_records(records, columns=["anime_id", "my_score"])

    def get_user_id(username):
        query = "query ($userName: String) { User (search: $userName) { id } }"
        variables = {"userName": "'" + username + "'"}
        response = call_api(url, {"query": query, "variables": variables})
        response.raise_for_status()
        return response.json()["data"]["User"]["id"]

    def get_anime_list(username):
        userid = get_user_id(username)

        query = """
        query ($userID: Int, $MEDIA: MediaType) {
        MediaListCollection (userId: $userID, type: $MEDIA) { 
            lists {
                entries
                {
                    status
                    score
                    media
                    {
                        idMal
                    }
                }
            }
        }
        }
        """
        variables = {"userID": str(userid), "MEDIA": "ANIME"}

        response = call_api(url, {"query": query, "variables": variables})
        response.raise_for_status()
        anime_list = process_json(response.json())
        anime_list['username'] = username
        # remove unrated shows
        anime_list = anime_list.loc[lambda x: x["my_score"] != 0]         
        return anime_list
        
    return get_anime_list(username)        

In [6]:
if source == "MAL":
    df = import_from_mal(username)
elif source == "AniList":
    df = import_from_anilist(username)
else:
    raise Exception(f"Unsupported anime-list source {source}")

In [7]:
df.sort_values(by='my_score')

Unnamed: 0,anime_id,my_score,username
31,36027,2.0,taapaye
387,5205,2.0,taapaye
88,20787,3.0,taapaye
51,23283,4.0,taapaye
106,8888,4.0,taapaye
...,...,...,...
485,9756,10.0,taapaye
494,9756,10.0,taapaye
232,9756,10.0,taapaye
515,9756,10.0,taapaye


In [8]:
anime_stats = pickle.load(open("../../processed_data/anime_stats.pkl", "rb"))
normalized_anime_stats = pickle.load(
    open("../../processed_data/normalized_anime_stats.pkl", "rb")
)

In [9]:
# see comments in NormalizeRatings.ipynb
df = df.merge(anime_stats, on="anime_id").merge(normalized_anime_stats, on="anime_id")
df["score"] = df["my_score"] - (df["anime_bias"] + df["my_score"].mean())
user_contrib = 0.5
anime_contrib = 0.5
df["score_var"] = (
    df["score"].var() * user_contrib + df["normalized_anime_var"] * anime_contrib
)
df[["username", "anime_id", "score", "score_var"]].to_pickle("user_anime_list.pkl")

In [10]:
# store the baseline predictor
blp = pd.DataFrame()
blp["blp"] = anime_stats["anime_bias"] + df["my_score"].mean()
blp.to_pickle("baseline_predictor.pkl")