## Get User Lists

In [None]:
import datetime
import glob
import os

import pandas as pd
from tqdm import tqdm

In [None]:
# Column names of the raw AniList list CSVs, in file order.
# Code in this file joins these names with "," to build the expected header
# line and uses ANILIST_HEADER.index(<name>) to locate a field inside a split
# row, so the order of the entries is significant.
ANILIST_HEADER = [
    "uid",
    "score",
    "status",
    "num_episodes_watched",
    "updated_at",
    "created_at",
    "is_rewatching",
    "username",
]

In [None]:
# remove any corrupted lines
def verify_user_media_list_consistency(input_fn):
    """Rewrite ``input_fn`` in place with a valid header and well-formed rows only.

    The first line is compared against the header built from ``ANILIST_HEADER``
    and replaced when it differs. Every following line whose comma-separated
    field count does not match ``ANILIST_HEADER`` is dropped. The cleaned
    content is written to ``input_fn + "~"`` and then moved over ``input_fn``.

    Args:
        input_fn: path of the CSV file to clean.
    """
    logger.info(f"Verifying consistency of entries in {input_fn}")
    output_fn = input_fn + "~"
    correct_header = ",".join(ANILIST_HEADER)
    expected_num_fields = len(ANILIST_HEADER)
    with open(input_fn, "r") as in_file, open(output_fn, "w") as out_file:
        for line_number, line in enumerate(tqdm(in_file)):
            if line_number == 0:
                if line.strip() != correct_header:
                    logger.warning(
                        f"Replacing malformed header line {line.strip()} "
                        f"with correct header {correct_header}"
                    )
                    # lines read from the file keep their newline, so the
                    # replacement needs one too or it fuses with the next row
                    line = correct_header + "\n"
                out_file.write(line)
                continue
            fields = line.strip().split(",")
            if len(fields) != expected_num_fields:
                logger.warning(
                    f"Deleting malformed line in {input_fn}: {line.strip()}"
                )
                continue
            out_file.write(line)
    # swap the files only after both handles are closed: replacing a file that
    # is still open fails on Windows
    os.replace(output_fn, input_fn)

In [None]:
def process_uid(uid):
    """Return ``uid`` as a digit-only string.

    A single trailing ".0" is removed first. An ``AssertionError`` is raised
    when the remaining text is not made up of digits only.
    """
    float_suffix = ".0"
    trimmed = uid[: -len(float_suffix)] if uid.endswith(float_suffix) else uid
    assert trimmed.isdigit()
    return trimmed


def process_status(status):
    """Map a status name to its numeric code, returned as a string.

    Known names are "completed" (5), "watching" (4), "on_hold" (3),
    "dropped" (2) and "plan_to_watch" (1). Any other value trips an assertion.
    """
    status_codes = {
        "completed": "5",
        "watching": "4",
        "on_hold": "3",
        "dropped": "2",
        "plan_to_watch": "1",
    }
    code = status_codes.get(status)
    assert code is not None
    return code


def isfloat(x):
    """Return True when ``float(x)`` succeeds and False when it raises."""
    try:
        float(x)
    except Exception:
        return False
    else:
        return True


def process_score(score):
    """Validate a raw score field and return it as a string.

    Args:
        score: the score as text; it must be parseable as a float.

    Returns:
        ``score`` unchanged when its numeric value lies in [0, 10],
        otherwise the string "0".

    Raises:
        AssertionError: if ``score`` cannot be parsed as a float.
    """
    assert isfloat(score)
    parsed_score = float(score)
    # kept as a negated range check so that NaN is also treated as invalid
    if not (0 <= parsed_score <= 10):
        logger.warning(f"invalid score {parsed_score}, replacing with 0")
        # return the replacement itself; returning the original text would
        # leave the invalid score in place
        return "0"
    return score


def process_num_episodes_watched(num):
    """Return ``num`` unchanged after asserting that it consists of digits only."""
    digits_only = num.isdigit()
    assert digits_only
    return num


def process_timestamp(time):
    """Return a digit-only timestamp string.

    A value whose first character is "-" is replaced by "0". The result must
    consist of digits only, otherwise an assertion fails.
    """
    starts_with_minus = time[0] == "-"
    normalized = "0" if starts_with_minus else time
    assert normalized.isdigit()
    return normalized


def process_username(username):
    """Return ``username`` unchanged."""
    return username

In [None]:
def select_timestamp(updated_ts, created_ts):
    """Choose which of the two timestamp strings to keep.

    The updated timestamp wins whenever it is non-zero (for consistency with
    MAL). When it is zero, the created timestamp is used if that one is
    non-zero; if both are zero the updated timestamp is returned.
    """
    if int(updated_ts) != 0:
        return updated_ts
    if int(created_ts) != 0:
        return created_ts
    return updated_ts

In [None]:
def get_output_header(media):
    """Return the ordered list of output column names for ``media``.

    "anime" has a single progress column ("episodes"); "manga" has two
    ("volumes", "chapters"). Any other value trips an assertion.
    """
    progress_columns = {
        "anime": ["episodes"],
        "manga": ["volumes", "chapters"],
    }
    if media in progress_columns:
        leading = ["username", f"{media}id", "score", "timestamp", "status"]
        trailing = ["userid", "source"]
        return leading + progress_columns[media] + trailing
    assert False

In [None]:
def process_header(header, media):
    """Check the input header line and return the output header line.

    ``header`` must split on "," into exactly ``ANILIST_HEADER``; otherwise an
    assertion fails. The result is the comma-joined output header for ``media``.
    """
    input_columns = header.split(",")
    assert input_columns == ANILIST_HEADER
    output_columns = get_output_header(media)
    return ",".join(output_columns)


def process_line(line):
    """Convert one input CSV row into one output CSV row.

    The row is split on "," and fields are looked up by their position in
    ``ANILIST_HEADER``. Which progress columns are emitted depends on the
    module-level ``media`` value ("anime" or "manga"). On any parsing error
    the offending line is printed and the exception is re-raised.
    """

    def column(name):
        # locate the field by its column name in the input header
        return fields[ANILIST_HEADER.index(name)]

    try:
        fields = line.split(",")
        row = ["AniList@" + column("username")]
        row.append(process_uid(column("uid")))
        row.append(process_score(column("score")))
        updated_ts = process_timestamp(column("updated_at"))
        created_ts = process_timestamp(column("created_at"))
        row.append(select_timestamp(updated_ts, created_ts))
        row.append(process_status(column("status")))
        if media == "anime":
            row.append(process_num_episodes_watched(column("num_episodes_watched")))
        elif media == "manga":
            # volumes: anilist doesn't record num_volumes, so it is set to 0
            row.append("0")
            # chapters
            row.append(process_num_episodes_watched(column("num_episodes_watched")))
        else:
            assert False
        row.append(process_username(column("username")))
        row.append("AniList")
    except Exception as err:
        print(f"Error: could not parse {line}")
        raise err
    assert len(row) == len(get_output_header(media))
    return ",".join(row)

In [None]:
def process_user_media_lists(infile, outfile):
    """Append the converted rows of ``infile`` to ``outfile``.

    The first line of ``infile`` is treated as its header. It is validated and
    converted into the output header only when ``outfile`` needs one, i.e.
    when ``outfile`` does not exist yet or exists but is empty; otherwise it
    is skipped. Every other line is converted with ``process_line`` and
    appended. The module-level ``media`` value is passed to ``process_header``.

    Args:
        infile: path of the input CSV file.
        outfile: path of the output CSV file, opened in append mode.
    """
    logger.info(f"processing entries in {infile}")
    # an existing but empty file (e.g. created by an earlier run that failed
    # before writing anything) still needs its header
    needs_header = not os.path.exists(outfile) or os.path.getsize(outfile) == 0
    with open(infile, "r") as in_file, open(outfile, "a") as out_file:
        for line_number, line in enumerate(tqdm(in_file)):
            line = line.strip()
            if line_number == 0:
                if needs_header:
                    out_file.write(process_header(line, media) + "\n")
                continue
            out_file.write(process_line(line) + "\n")