## Get User Lists

In [None]:
import datetime
import os

import pandas as pd
from tqdm import tqdm

In [None]:
def get_mal_header(media):
    """Return the ordered column names expected in the input list file.

    ``media`` must be ``"anime"`` or ``"manga"``; any other value trips an
    assertion. A fresh list is built on every call.
    """
    # Both layouts start and end with the same columns; only the middle
    # (progress-related) columns differ per media type.
    leading = ["uid", "status", "score"]
    trailing = ["updated_at", "username"]
    if media == "anime":
        anime_only = [
            "num_episodes_watched",
            "is_rewatching",
            "start_date",
            "finish_date",
            "priority",
            "num_times_rewatched",
            "rewatch_value",
        ]
        return leading + anime_only + trailing
    if media == "manga":
        manga_only = ["num_volumes", "num_chapters"]
        return leading + manga_only + trailing
    assert False

In [None]:
def get_output_header(media):
    """Return the ordered column names of the processed output file.

    ``media`` must be ``"anime"`` or ``"manga"``; any other value trips an
    assertion. The id column is named after the media type
    (``animeid`` / ``mangaid``).
    """
    shared_prefix = ["username", f"{media}id", "score", "timestamp", "status"]
    shared_suffix = ["source"]
    if media == "anime":
        return shared_prefix + ["episodes"] + shared_suffix
    if media == "manga":
        return shared_prefix + ["volumes", "chapters"] + shared_suffix
    assert False

In [None]:
# remove any corrupted lines
def verify_user_media_list_consistency(input_fn):
    """Rewrite ``input_fn`` in place, fixing its header and dropping bad rows.

    The first line is replaced by the expected header (per
    ``get_mal_header(media)``) when it does not match. Every later line whose
    comma-separated field count differs from the header's is discarded with a
    warning. The cleaned content is written to ``input_fn + "~"`` and then
    moved over the original file.

    Relies on the module-level ``media`` and ``logger`` names.
    """
    logger.info(f"Verifying consistency of entries in {input_fn}")
    output_fn = input_fn + "~"
    # The expected layout does not change while scanning, so compute it once
    # instead of once per line.
    expected_columns = get_mal_header(media)
    correct_header = ",".join(expected_columns) + "\n"
    with open(input_fn, "r") as in_file, open(output_fn, "w") as out_file:
        for line_number, line in enumerate(tqdm(in_file)):
            if line_number == 0:
                if line.strip() != correct_header.strip():
                    # Call strip() so the log shows the offending text, not
                    # the repr of the bound method.
                    logger.warning(
                        f"Replacing malformed header line {line.strip()} "
                        f"with correct header {correct_header.strip()}"
                    )
                    line = correct_header
                out_file.write(line)
                continue
            fields = line.strip().split(",")
            if len(fields) != len(expected_columns):
                logger.warning(
                    f"Deleting malformed line in user_{media}_list.csv {line} "
                )
                continue
            out_file.write(line)
    # Swap the cleaned copy in only after both files are closed.
    os.replace(output_fn, input_fn)

In [None]:
def to_unix_time(date, fmt):
    """Parse ``date`` with ``fmt`` and return whole Unix seconds as a string.

    The parsed value is a naive ``datetime``, so ``timestamp()`` interprets it
    in the machine's local timezone.
    """
    parsed = datetime.datetime.strptime(date, fmt)
    seconds = int(parsed.timestamp())
    return str(seconds)

In [None]:
def process_uid(uid):
    """Return ``uid`` unchanged after asserting it consists only of digits."""
    all_digits = uid.isdigit()
    assert all_digits
    return uid


def process_status(status):
    """Translate a list status string into its numeric code (as a string).

    An empty status maps to ``"0"``; an unrecognised status trips an
    assertion.
    """
    # Checked top to bottom; each row is (code, statuses that map to it).
    code_table = (
        ("5", ("completed",)),
        ("4", ("watching", "reading")),
        ("3", ("on_hold",)),
        ("2", ("dropped",)),
        ("1", ("plan_to_watch", "plan_to_read")),
        ("0", ("",)),
    )
    for code, names in code_table:
        if status in names:
            return code
    assert False


def isfloat(x):
    """Return True when ``float(x)`` succeeds, False when it raises."""
    try:
        float(x)
    except Exception:
        return False
    else:
        return True


def process_score(score):
    """Return ``score`` unchanged after asserting it is a number in [0, 10]."""
    assert isfloat(score)
    numeric = float(score)
    assert 0 <= numeric <= 10
    return score


def process_num_episodes_watched(num):
    """Return ``num`` unchanged after asserting it consists only of digits."""
    all_digits = num.isdigit()
    assert all_digits
    return num


def process_is_rewatching(rewatching):
    """Return ``"0"`` for the input ``"-1"``; any other input trips an assertion."""
    if rewatching != "-1":
        assert False
    else:
        return "0"


def process_start_date(date):
    """Convert a ``%Y-%m-%d`` start date to Unix seconds (as a string).

    An empty (missing) date yields ``"0"``. Raises ``ValueError`` when a
    non-empty date does not match the expected format.
    """
    # Test the ``date`` argument itself; the previous check referenced an
    # undefined name and raised NameError on every call.
    if not date:
        return "0"
    return to_unix_time(date, "%Y-%m-%d")


def process_finish_date(date):
    """Convert a ``%Y-%m-%d`` finish date to Unix seconds (as a string).

    An empty (missing) date yields ``"0"``. Raises ``ValueError`` when a
    non-empty date does not match the expected format.
    """
    # Test the ``date`` argument itself; the previous check referenced an
    # undefined name and raised NameError on every call.
    if not date:
        return "0"
    return to_unix_time(date, "%Y-%m-%d")


def process_priority(priority):
    """Return ``priority`` unchanged after asserting it equals ``"-1"``."""
    is_expected = priority == "-1"
    assert is_expected
    return priority


def process_num_times_rewatched(rewatch):
    """Pass the rewatch count through untouched (no validation is applied)."""
    return rewatch


def process_rewatch_value(value):
    """Pass the rewatch value through untouched (no validation is applied)."""
    return value


def process_updated_at(time):
    """Convert an ``updated_at`` value to Unix seconds (as a string).

    The expected input is ``%Y-%m-%dT%H:%M:%S+00:00``. Raises ``ValueError``
    when the value does not match that layout after the separator repair
    below.
    """
    # this should be %Y-%m-%dT%H:%M:%S+00:00" but sometimes the dash
    # is corrupted and replaced with a different character
    time = time[0:4] + "-" + time[5:7] + "-" + time[8:]
    parsed = datetime.datetime.strptime(time, "%Y-%m-%dT%H:%M:%S+00:00")
    # The literal "+00:00" suffix marks the value as UTC. strptime returns a
    # naive datetime here, which timestamp() would otherwise read as local
    # time, so pin it to UTC to get the same result on every machine.
    parsed = parsed.replace(tzinfo=datetime.timezone.utc)
    return str(int(parsed.timestamp()))


def process_username(username):
    """Pass the username through untouched (no validation is applied)."""
    return username

In [None]:
def process_header(header):
    """Check the input header line and return the output header line.

    Asserts that ``header`` split on commas equals ``get_mal_header(media)``,
    then returns the comma-joined ``get_output_header(media)``. Relies on the
    module-level ``media`` name.
    """
    received_columns = header.split(",")
    expected_columns = get_mal_header(media)
    assert received_columns == expected_columns
    output_columns = get_output_header(media)
    return ",".join(output_columns)


def process_line(line):
    """Convert one comma-separated input row into one output row.

    Fields are located by their position in ``get_mal_header(media)`` and
    passed through the matching ``process_*`` helper, then emitted in output
    order followed by the constant source tag ``"MAL"``. Any failure while
    converting is reported with ``print`` and re-raised. Relies on the
    module-level ``media`` name.
    """
    columns = get_mal_header(media)
    converted = []
    try:
        raw_fields = line.split(",")

        def column(name):
            # Look the raw value up by its column name in the input layout.
            return raw_fields[columns.index(name)]

        converted += [process_username(column("username"))]
        converted += [process_uid(column("uid"))]
        converted += [process_score(column("score"))]
        converted += [process_updated_at(column("updated_at"))]
        converted += [process_status(column("status"))]
        if media == "anime":
            progress_columns = ["num_episodes_watched"]
        elif media == "manga":
            progress_columns = ["num_volumes", "num_chapters"]
        else:
            assert False
        # Every progress counter goes through the same digit check.
        for progress_column in progress_columns:
            converted += [process_num_episodes_watched(column(progress_column))]
        converted += ["MAL"]
    except Exception as e:
        print(f"Error: could not parse {line}")
        raise e
    assert len(converted) == len(get_output_header(media))
    return ",".join(converted)

In [None]:
def process_user_media_lists(infile, outfile):
    """Append the processed rows of ``infile`` to ``outfile``.

    The first line of ``infile`` is treated as the header: it is converted and
    written only when ``outfile`` did not exist before this call, otherwise it
    is skipped. Every other line is stripped, converted with ``process_line``
    and appended. Relies on the module-level ``logger`` name.
    """
    logger.info(f"processing entries in {infile}")
    # Must be checked before opening in append mode, which creates the file.
    write_header = not os.path.exists(outfile)
    with open(infile, "r") as source, open(outfile, "a") as sink:
        for position, raw_line in enumerate(tqdm(source)):
            stripped = raw_line.strip()
            if position == 0:
                if write_header:
                    sink.write(process_header(stripped) + "\n")
            else:
                sink.write(process_line(stripped) + "\n")