# Importing datasets
* Converts data obtained from the MyAnimeList API into a shared format
* Note that loading and saving the datasets may take several minutes

In [1]:
import logging
import os

import pandas as pd
from tqdm import tqdm

In [2]:
# Root directory of the input CSV files; the cells below read from its
# anime_facts/ and user_anime_facts/ subdirectories.
source_dir = "../../data/mal/"

In [3]:
# Destination directory for the converted datasets.
outdir = "../../data/raw_data"
# makedirs also creates missing parent directories (os.mkdir would raise
# FileNotFoundError on a fresh checkout), and exist_ok=True makes the call a
# no-op when the directory is already there, without a racy exists() check.
os.makedirs(outdir, exist_ok=True)

In [4]:
# logging
# Named logger used by the cells below; DEBUG level lets every message through.
logger = logging.getLogger("MyAnimeList")
logger.setLevel(logging.DEBUG)
formatter = logging.Formatter(
    "%(name)s:%(levelname)s:%(asctime)s: %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
)
# logging.getLogger returns the same object every time this cell runs, so drop
# handlers left over from a previous run; otherwise each re-run would add one
# more StreamHandler and every message would be printed multiple times.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
for stream in [
    logging.StreamHandler(),
]:
    stream.setFormatter(formatter)
    logger.addHandler(stream)

## Get anime facts

In [5]:
# Load the anime table from anime_facts/anime.csv under source_dir.
anime = pd.read_csv(os.path.join(source_dir, "anime_facts/anime.csv"))

In [6]:
# Order the rows by anime_id so the exported file has a stable row order.
anime = anime.sort_values(by="anime_id")

In [7]:
# Write the sorted table to outdir; index=False keeps the DataFrame index out
# of the CSV.
anime.to_csv(os.path.join(outdir, "anime.csv"), index=False)

## Get User Lists

In [8]:
# Entries in user_anime_list.csv can be malformed if the notebook crashes in the
# middle of saving a file. This function removes any malformed lines.
def verify_user_anime_list_consistency():
    """Rewrite user_anime_list.csv in place, keeping only well-formed lines.

    The file is streamed line by line into a sibling temporary file
    (``user_anime_list.csv~``), which then replaces the original. While copying:

    * the first line is replaced by ``anime_id,my_score,username`` if it differs;
    * lines that do not split into exactly three comma-separated fields are
      dropped, with a warning per line;
    * lines whose username is not flagged ``success`` in user_status.csv are
      dropped, with one warning per distinct username.

    Uses the notebook-level ``source_dir`` and ``logger``. Returns None.
    """
    logger.info("Verifying consistency of existing entries in user_anime_list.csv")
    input_fn = os.path.join(source_dir, "user_anime_facts/user_anime_list.csv")
    output_fn = input_fn + "~"
    correct_header = "anime_id,my_score,username\n"
    # keep_default_na=False keeps usernames such as "NA" or "null" as text, and
    # dtype=str keeps numeric-looking usernames comparable with the string
    # fields split from each line below.
    user_status = pd.read_csv(
        os.path.join(source_dir, "user_anime_facts/user_status.csv"),
        keep_default_na=False,
        dtype={"username": str},
    )
    successful_users = set(user_status.loc[lambda x: x["success"]]["username"])
    invalid_users = set()
    # Read and write as UTF-8, the encoding pandas assumes by default when it
    # loads these CSV files, instead of the platform's locale encoding.
    with open(input_fn, "r", encoding="utf-8") as in_file, open(
        output_fn, "w", encoding="utf-8"
    ) as out_file:
        header = False
        for line in tqdm(in_file):
            if not header:
                # First line of the file: make sure it is the expected header.
                header = True
                if line.strip() != correct_header.strip():
                    logger.warning(
                        f"Replacing malformed header line {line.strip()} "
                        f"with correct header {correct_header.strip()}"
                    )
                    line = correct_header
                out_file.write(line)
                continue
            fields = line.strip().split(",")
            if len(fields) != 3:
                logger.warning(
                    f"Deleting malformed line in user_anime_list.csv {line} "
                )
                continue
            anime_id, score, username = fields
            if username not in successful_users:
                # Warn only once per username to avoid flooding the log.
                if username not in invalid_users:
                    invalid_users.add(username)
                    logger.warning(
                        f"Deleting entries in user_anime_list.csv for "
                        f" unrecognized username {username}"
                    )
                continue
            out_file.write(line)
    # Swap the cleaned copy in only after both files have been closed.
    os.replace(output_fn, input_fn)


verify_user_anime_list_consistency()

MyAnimeList:INFO:2022-06-03 01:26:38: Verifying consistency of existing entries in user_anime_list.csv
230097674it [02:58, 1287180.26it/s]


In [9]:
# Load every user's list entries.
# Usernames are free text: with pandas' default NA handling, names such as
# "NA", "null" or "nan" would be turned into missing values and written out as
# empty usernames. Default NA parsing is therefore switched off, matching how
# user_status.csv is read during verification, and the column is forced to str
# so numeric-looking names keep their exact spelling. Empty fields in the two
# numeric columns are still treated as missing so those columns parse as
# numbers exactly as before.
anime_lists = pd.read_csv(
    os.path.join(source_dir, "user_anime_facts/user_anime_list.csv"),
    dtype={"username": str},
    keep_default_na=False,
    na_values={"anime_id": [""], "my_score": [""]},
)

In [10]:
# Arrange the columns in the order used by the exported files.
anime_lists = anime_lists.loc[:, ["username", "anime_id", "my_score"]]
# Rows with my_score == 0 form the implicit lists, with the score overwritten
# to 1; every other row goes to the explicit lists with its score unchanged.
implicit_lists = (
    anime_lists[anime_lists["my_score"].eq(0)]
    .reset_index(drop=True)
    .assign(my_score=1)
)
explicit_lists = anime_lists[anime_lists["my_score"].ne(0)].reset_index(drop=True)

In [11]:
# Save both splits to outdir; index=False keeps the DataFrame index out of the
# CSV files.
explicit_lists.to_csv(os.path.join(outdir, "user_explicit_lists.csv"), index=False)
implicit_lists.to_csv(os.path.join(outdir, "user_implicit_lists.csv"), index=False)

## Write source

In [12]:
# Record where the data came from in a small text file next to the exported
# datasets.
with open(os.path.join(outdir, "source.txt"), "w") as f:
    f.write("Dataset obtained using the MyAnimeList API")