# Importing datasets
* Converts data obtained from the MyAnimeList API into a shared format
* Note that loading and saving the datasets may take several minutes

In [None]:
import datetime
import logging
import os

import pandas as pd
from tqdm import tqdm

In [None]:
# Directory holding the MyAnimeList data that the cells below read from.
source_dir = "../../data/mal/"

In [None]:
# Directory that receives the converted datasets.
# makedirs(..., exist_ok=True) also creates missing parent directories and is a
# no-op when the directory already exists, so the cell can be re-run safely.
outdir = "../../data/raw_data"
os.makedirs(outdir, exist_ok=True)

In [None]:
# Notebook-wide logger: DEBUG level, one stream handler, timestamped records.
logger = logging.getLogger("MyAnimeList")
logger.setLevel(logging.DEBUG)
formatter = logging.Formatter(
    fmt="%(name)s:%(levelname)s:%(asctime)s: %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
stream = logging.StreamHandler()
stream.setFormatter(formatter)
logger.addHandler(stream)

## Get anime facts

In [None]:
# Load the anime facts table from anime_facts/anime.csv under source_dir.
anime = pd.read_csv(os.path.join(source_dir, "anime_facts/anime.csv"))

In [None]:
# Order the rows by anime_id before saving.
anime = anime.sort_values(by="anime_id")

In [None]:
# Write the sorted table to anime.csv under outdir, without the index column.
anime.to_csv(os.path.join(outdir, "anime.csv"), index=False)

## Get User Lists

In [None]:
# Entries in user_anime_list.csv can be malformed if the notebook crashes in the
# middle of saving a file. This function removes any malformed lines.
def verify_user_anime_list_consistency(input_fn):
    """Rewrite ``input_fn`` in place, dropping rows that cannot be trusted.

    The first line is replaced by the expected 12-column header if it differs.
    Every other line is kept only when it splits into exactly 12
    comma-separated fields and its last field (the username) belongs to a
    user whose ``success`` flag is set in ``user_anime_facts/user_status.csv``
    (read from ``source_dir``).

    The filtered copy is written to ``input_fn + "~"`` and then moved over
    ``input_fn`` with ``os.replace``, so the original file is only replaced
    after every line has been processed.

    Args:
        input_fn: Path of the user anime list CSV to clean.
    """
    logger.info(f"Verifying consistency of entries in {input_fn}")
    output_fn = input_fn + "~"
    correct_header = (
        "uid,status,score,num_episodes_watched,is_rewatching,start_date,"
        "finish_date,priority,num_times_rewatched,rewatch_value,updated_at,"
        "username\n"
    )
    user_status = pd.read_csv(
        os.path.join(source_dir, "user_anime_facts/user_status.csv"),
        keep_default_na=False,
    )
    successful_users = set(user_status.loc[lambda x: x["success"]]["username"])
    # Usernames already reported, so each unknown user is logged only once.
    invalid_users = set()
    with open(input_fn, "r") as in_file, open(output_fn, "w") as out_file:
        for line_number, line in enumerate(tqdm(in_file)):
            if line_number == 0:
                if line.strip() != correct_header.strip():
                    logger.warning(
                        f"Replacing malformed header line {line.strip()} "
                        f"with correct header {correct_header.strip()}"
                    )
                    line = correct_header
                out_file.write(line)
                continue
            fields = line.strip().split(",")
            if len(fields) != 12:
                logger.warning(
                    f"Deleting malformed line in user_anime_list.csv {line} "
                )
                continue
            username = fields[11]
            if username not in successful_users:
                if username not in invalid_users:
                    invalid_users.add(username)
                    logger.warning(
                        f"Deleting entries in user_anime_list.csv for "
                        f" unrecognized username {username}"
                    )
                continue
            out_file.write(line)
    os.replace(output_fn, input_fn)

In [None]:
def to_unix_time(date, fmt):
    """Parse ``date`` with ``fmt`` and return its Unix timestamp as a string.

    A value parsed without timezone information is interpreted as UTC, so the
    result does not depend on the timezone of the machine running the
    notebook. A value parsed with an explicit offset (``%z`` in ``fmt``)
    keeps that offset.

    Args:
        date: Date/time text to parse.
        fmt: ``strptime`` format describing ``date``.

    Returns:
        The whole number of seconds since the Unix epoch, as a decimal string.

    Raises:
        ValueError: If ``date`` does not match ``fmt``.
    """
    parsed = datetime.datetime.strptime(date, fmt)
    if parsed.tzinfo is None:
        # A naive datetime would otherwise be read as local time by timestamp().
        parsed = parsed.replace(tzinfo=datetime.timezone.utc)
    return str(int(parsed.timestamp()))

In [None]:
def process_uid(uid):
    """Check that the anime id field is all digits and return it unchanged.

    Fails an assertion when ``uid`` contains anything other than digits.
    """
    assert uid.isdigit()
    return uid


def process_status(status):
    """Translate a watch-status name into its numeric code (as a string).

    The empty status maps to "0"; any status missing from the table fails an
    assertion.
    """
    codes = {
        "completed": "5",
        "watching": "4",
        "on_hold": "3",
        "dropped": "2",
        "plan_to_watch": "1",
        "": "0",
    }
    if status in codes:
        return codes[status]
    assert False


def process_score(score):
    """Check that the score field is an integer from 0 to 10 and return it.

    The original text is returned unchanged; a non-digit or out-of-range
    value fails an assertion.
    """
    assert score.isdigit()
    assert 0 <= int(score) <= 10
    return score


def process_num_episodes_watched(num):
    """Check that the watched-episode count is all digits; return it as is.

    Fails an assertion when ``num`` contains anything other than digits.
    """
    assert num.isdigit()
    return num


def process_is_rewatching(rewatching):
    """Convert the rewatching field to the shared-format flag.

    Only the value "-1" is accepted, and it is emitted as "0". Any other
    value fails an assertion.
    """
    if rewatching == "-1":
        return "0"
    assert False


def process_start_date(date):
    """Convert a ``YYYY-MM-DD`` start date to a Unix-time string.

    Args:
        date: The start date text; may be empty when no date was recorded.

    Returns:
        "0" for an empty date, otherwise the result of ``to_unix_time``.
    """
    # The emptiness check must look at this function's own argument.
    if not date:
        return "0"
    return to_unix_time(date, "%Y-%m-%d")


def process_finish_date(date):
    """Convert a ``YYYY-MM-DD`` finish date to a Unix-time string.

    Args:
        date: The finish date text; may be empty when no date was recorded.

    Returns:
        "0" for an empty date, otherwise the result of ``to_unix_time``.
    """
    # The emptiness check must look at this function's own argument.
    if not date:
        return "0"
    return to_unix_time(date, "%Y-%m-%d")


def process_priority(priority):
    """Check that the priority field is "-1" and return it unchanged.

    Any other value fails an assertion.
    """
    assert priority == "-1"
    return priority


def process_num_times_rewatched(rewatch):
    """Pass the rewatch-count field through unchanged (no validation)."""
    return rewatch


def process_rewatch_value(value):
    """Pass the rewatch-value field through unchanged (no validation)."""
    return value


def process_updated_at(time):
    """Convert the ``updated_at`` field to Unix time via ``to_unix_time``.

    The field is parsed with the format ``%Y-%m-%dT%H:%M:%S+00:00``.
    """
    updated_at_format = "%Y-%m-%dT%H:%M:%S+00:00"
    return to_unix_time(time, updated_at_format)


def process_username(username):
    """Pass the username field through unchanged (no validation)."""
    return username

In [None]:
def process_header(header):
    """Validate the input header line and return the shared-format header.

    ``header`` must consist of exactly the twelve expected input column
    names, comma-separated and in order; otherwise an assertion fails. The
    return value is the comma-joined list of output column names.
    """
    expected_columns = (
        "uid",
        "status",
        "score",
        "num_episodes_watched",
        "is_rewatching",
        "start_date",
        "finish_date",
        "priority",
        "num_times_rewatched",
        "rewatch_value",
        "updated_at",
        "username",
    )
    output_columns = (
        "username",
        "animeid",
        "score",
        "timestamp",
        "status",
        "episodes",
        "rewatch",
        "source",
    )
    assert tuple(header.split(",")) == expected_columns
    return ",".join(output_columns)


def process_line(line):
    """Convert one comma-separated input row into a shared-format row.

    Fields are read by position, validated or converted by the matching
    ``process_*`` helper, and re-joined in the output column order
    (username, anime id, score, timestamp, status, episodes, rewatch,
    source). The source column is always "MAL".

    If any field is missing or fails its helper, the offending line is
    printed and the exception is re-raised.

    NOTE(review): the rewatch flag is taken from column 9, which the header
    checked by ``process_header`` names ``rewatch_value``, not from column 4
    (``is_rewatching``) -- confirm this is intended.
    """
    try:
        columns = line.split(",")
        converted = {
            "animeid": process_uid(columns[0]),
            "status": process_status(columns[1]),
            "score": process_score(columns[2]),
            "episodes": process_num_episodes_watched(columns[3]),
            "rewatch": process_is_rewatching(columns[9]),
            "timestamp": process_updated_at(columns[10]),
            "username": process_username(columns[11]),
            "source": "MAL",
        }
    except Exception as e:
        print(f"Error: could not parse {line}")
        raise e
    output_order = (
        "username",
        "animeid",
        "score",
        "timestamp",
        "status",
        "episodes",
        "rewatch",
        "source",
    )
    return ",".join(converted[column] for column in output_order)

In [None]:
def process_user_anime_lists(infile, outfile):
    """Convert the user anime list at ``infile`` into ``outfile``.

    The first line is translated with ``process_header`` and every later
    line with ``process_line``. Each input line is stripped of surrounding
    whitespace first, and each output line ends with a newline.
    """
    logger.info(f"processing entries in {infile}")
    with open(infile, "r") as source, open(outfile, "w") as target:
        for position, raw_line in enumerate(tqdm(source)):
            text = raw_line.strip()
            # Only the very first line of the file is the header.
            convert = process_header if position == 0 else process_line
            target.write(convert(text) + "\n")

In [None]:
# Clean user_anime_list.csv in place before it is converted below.
verify_user_anime_list_consistency(
    os.path.join(source_dir, "user_anime_facts/user_anime_list.csv")
)

In [None]:
# Convert the user anime list and write it to user_anime_list.mal.csv in outdir.
process_user_anime_lists(
    os.path.join(source_dir, "user_anime_facts/user_anime_list.csv"),
    os.path.join(outdir, "user_anime_list.mal.csv"),
)