# Standardize
* Aligns media facts from the different sources into one shared set of columns
* Note that the genres column is not standardized: each source keeps its own genre values

In [None]:
import glob
import json
import os
import re

import pandas as pd
from tqdm import tqdm

In [None]:
def is_valid_date(fields):
    """Return True when the date components form a real calendar date.

    Args:
        fields: Sequence of numeric strings ordered as year, then month,
            then day; trailing components may be left out.

    Returns:
        True if ``fields`` is empty or pandas can parse the components with
        the matching prefix of the ``%Y-%m-%d`` format, otherwise False.
    """
    if len(fields) == 0:
        return True
    names = ["%Y", "%m", "%d"]
    try:
        pd.to_datetime("-".join(fields), format="-".join(names[: len(fields)]))
    except (ValueError, TypeError, OverflowError):
        # Only parsing failures mean "invalid". A bare ``except`` would also
        # swallow KeyboardInterrupt and SystemExit.
        return False
    return True


def make_date(year, month, day):
    """Build the longest valid ``year[-month[-day]]`` string from the inputs.

    Components are taken in order until the first ``None``; a month without
    a year, or a day without a month, is therefore ignored. Trailing
    components are then dropped until ``is_valid_date`` accepts the rest.
    An empty string is returned when nothing usable remains.
    """
    parts = []
    for component in (year, month, day):
        if component is None:
            break
        parts.append(str(component))
    assert all(p.isdigit() for p in parts), (year, month, day)
    while not is_valid_date(parts):
        parts = parts[:-1]
    return "-".join(parts)

In [None]:
def get_version(version):
    """Turn a dotted version string such as ``"1.10.0"`` into a tuple of ints."""
    return tuple(map(int, version.split(".")))

In [None]:
def import_media_relations(source, medium):
    """Load and normalize the relation tables of one source and medium.

    Every ``{medium}_relations.*.csv`` file in the source's ``media_facts``
    directory is read and concatenated. Relation names are mapped onto one
    shared vocabulary, the result is checked with asserts, and only the five
    relation columns are returned.
    """
    pattern = f"../../../data/{source}/media_facts/{medium}_relations.*.csv"
    frames = [pd.read_csv(path) for path in glob.glob(pattern)]
    df = pd.concat(frames, ignore_index=True)

    known_relations = {
        "adaptation",
        "alternative_setting",
        "alternative_version",
        "character",
        "full_story",
        "other",
        "parent_story",
        "prequel",
        "sequel",
        "side_story",
        "spin_off",
        "summary",
    }
    # Source-specific spellings that must be renamed to the shared vocabulary.
    renames = {
        "anilist": {
            "alternative": "alternative_version",
            "parent": "parent_story",
        },
        "kitsu": {"spinoff": "spin_off"},
    }

    if source in ("mal", "anilist"):
        for column in ("relation", "source_media", "target_media"):
            df[column] = df[column].str.lower()
    if source in renames:
        df["relation"] = df["relation"].replace(renames[source])
    if source == "animeplanet":
        # Rows written by api versions up to 3.0.0 have the target id and
        # target medium columns swapped; put them back.
        legacy = df["api_version"].apply(get_version) <= get_version("3.0.0")
        old_target_id = df.loc[legacy, "target_id"]
        old_target_media = df.loc[legacy, "target_media"]
        df.loc[legacy, "target_id"] = old_target_media
        df.loc[legacy, "target_media"] = old_target_id
        df["relation"] = df["relation"].replace({"relation": "other"})

    unknown_relations = set(df.relation) - known_relations
    assert not unknown_relations, unknown_relations
    unknown_media = set(df.target_media) - {"manga", "anime"}
    assert not unknown_media, unknown_media
    assert all(df["source_media"] == medium), set(df["source_media"])
    return df[["relation", "source_id", "source_media", "target_id", "target_media"]]

In [None]:
def import_mal(medium):
    """Standardize the ``mal`` media facts for one medium.

    Reads every ``{medium}.*.csv`` file under ``data/mal/media_facts`` and
    converts the raw columns into the shared output columns: uid, title,
    alttitle, summary, mediatype, startdate, enddate, episodes, duration,
    chapters, volumes, status, season, studios, genres, source, accessed_at.

    Args:
        medium: ``"anime"`` or ``"manga"``.

    Returns:
        One DataFrame holding the rows of all matching files.

    Raises:
        KeyError: If a raw type, status or source value has no mapping, or a
            required raw column is missing.
        ValueError: If no input file matches (``pd.concat`` gets nothing).
    """
    # Constant lookup tables, built once instead of once per parsed cell.
    month_numbers = {
        "Jan": 1,
        "Feb": 2,
        "Mar": 3,
        "Apr": 4,
        "May": 5,
        "Jun": 6,
        "Jul": 7,
        "Aug": 8,
        "Sep": 9,
        "Oct": 10,
        "Nov": 11,
        "Dec": 12,
    }
    mediatype_maps = {
        "anime": {
            "ONA": "ONA",
            "TV": "TV",
            "OVA": "OVA",
            "Special": "Special",
            "Movie": "Movie",
            "Music": "Music",
            "TV Special": "TV Special",
            "CM": "CM",
            "PV": "PV",
            "Unknown": "",
        },
        "manga": {
            "Manhwa": "Manhwa",
            "Manhua": "Manhua",
            "Manga": "Manga",
            "One-shot": "One-shot",
            "Light Novel": "Light Novel",
            "Doujinshi": "Doujinshi",
            "Novel": "Novel",
        },
    }
    source_map = {
        "Original": "Original",
        "Manga": "Manga",
        "Unknown": "",
        "Game": "Game",
        "Other": "Other",
        "Visual novel": "Visual Novel",
        "Light novel": "Light Novel",
        "Novel": "Novel",
        "Web manga": "Web Manga",
        "4-koma manga": "4-koma Manga",
        "Music": "Music",
        "Picture book": "Picture Book",
        "Mixed media": "Mixed Media",
        "Book": "Book",
        "Web novel": "Web Novel",
        "Card game": "Card Game",
        "Radio": "Radio",
        "": "",
    }

    def episodes(x):
        """Episode count as int, or "" when blank or "Unknown"."""
        if x in ["", "Unknown"]:
            return ""
        return int(x)

    def duration(x):
        """Runtime in minutes parsed from text like "1 hr. 30 min. per ep."."""
        if x in ["", "Unknown"]:
            return ""
        if x.endswith(" per ep."):
            x = x[: -len(" per ep.")]
        total = 0
        # Each unit is optional; the number is the token just before the unit.
        if "hr." in x:
            hours = int(x.split(" hr.")[0].split()[-1])
            total += hours * 60
        if "min." in x:
            minutes = int(x.split(" min.")[0].split()[-1])
            total += minutes
        if "sec." in x:
            seconds = int(x.split(" sec.")[0].split()[-1])
            total += seconds / 60
        return total

    def chapters(x):
        """Chapter count as int, or "" when blank or "?"."""
        if x in ["", "?"]:
            return ""
        return int(x)

    def volumes(x):
        """Volume count as int, or "" when blank or "?"."""
        if x in ["", "?"]:
            return ""
        return int(x)

    def date(x):
        """Convert "Mon D, YYYY", "Mon YYYY" or "YYYY" via ``make_date``."""
        if x in ["Not available", "?"]:
            return ""
        fields = x.replace(",", "").split()
        if not fields:
            # Blank cell: treat like the other parsers do instead of failing
            # with an IndexError below.
            return ""
        if len(fields) == 3:
            return make_date(fields[2], month_numbers.get(fields[0], None), fields[1])
        elif len(fields) == 2:
            return make_date(fields[1], month_numbers.get(fields[0], None), None)
        else:
            return make_date(fields[0], None, None)

    def mediatype(x, medium):
        """Map the raw type label; unknown labels raise KeyError."""
        return mediatype_maps[medium][x]

    def status(x, startdate, medium):
        """Map the raw status label to the shared status vocabulary."""
        # A title that has not started counts as "Upcoming" only when a start
        # date is known; otherwise it is "TBA".
        pending = "Upcoming" if startdate != "" else "TBA"
        maps = {
            "anime": {
                "Finished Airing": "Finished",
                "Currently Airing": "Releasing",
                "Not yet aired": pending,
                "": "",
            },
            "manga": {
                "Finished": "Finished",
                "Publishing": "Releasing",
                "On Hiatus": "On Hiatus",
                "Not yet published": pending,
                "Discontinued": "Cancelled",
                "": "",
            },
        }
        return maps[medium][x]

    def season(x):
        """Reorder a two-word season label into "second-first", first lowercased."""
        if x == "":
            return ""
        fields = x.split()
        return "-".join([fields[1], fields[0].lower()])

    def source(x):
        """Map the raw source-material label; unknown labels raise KeyError."""
        if x == "":
            return ""
        x = x.strip()
        return source_map[x]

    def to_json(x):
        """Re-encode a Python-literal cell as JSON."""
        # NOTE(review): eval executes whatever the CSV cell contains. This is
        # only acceptable while the input files are trusted.
        return json.dumps(eval(x))

    def maybe_parse(df, col, parsefn):
        """Apply ``parsefn`` to ``df[col]``, or give "" when the column is absent."""
        if col not in df.columns:
            return ""
        return df[col].apply(parsefn)

    dfs = []
    for f in tqdm(glob.glob(f"../../../data/mal/media_facts/{medium}.*.csv")):
        # keep_default_na=False keeps blank cells as "" so the parsers above
        # can compare against string sentinels.
        r = pd.read_csv(f, keep_default_na=False)
        df = pd.DataFrame()
        df["uid"] = r["uid"]
        df["title"] = r["title"]
        df["alttitle"] = r["english_title"]
        df["summary"] = r["summary"]
        df["mediatype"] = r["type"].apply(lambda x: mediatype(x, medium))
        df["startdate"] = r["start_date"].apply(date)
        df["enddate"] = r["end_date"].apply(date)
        df["episodes"] = maybe_parse(r, "num_episodes", episodes)
        df["duration"] = maybe_parse(r, "duration", duration)
        df["chapters"] = maybe_parse(r, "num_chapters", chapters)
        # Use the volumes parser here (it was parsed with ``chapters`` before,
        # which left ``volumes`` unused; the output is the same).
        df["volumes"] = maybe_parse(r, "num_volumes", volumes)
        df["status"] = [
            status(s, d, medium) for (s, d) in zip(r["status"], df["startdate"])
        ]
        df["season"] = maybe_parse(r, "season", season)
        df["studios"] = r["studios"].apply(to_json)
        df["genres"] = r["genres"].apply(to_json)
        df["source"] = maybe_parse(r, "source", source)
        df["accessed_at"] = r["accessed_at"]
        dfs.append(df)
    return pd.concat(dfs, ignore_index=True)

In [None]:
def import_anilist(medium):
    """Standardize the ``anilist`` media facts for one medium.

    Each ``{medium}.*.csv`` file under ``data/anilist/media_facts`` is read
    and converted to the shared output columns; the per-file frames are
    concatenated and returned.
    """
    type_labels = {
        "anime": {
            "SPECIAL": "Special",
            "ONA": "ONA",
            "OVA": "OVA",
            "TV": "TV",
            "MOVIE": "Movie",
            "TV_SHORT": "TV",
            "MUSIC": "Music",
            "": "",
        },
        "manga": {
            "MANGA": "Manga",
            "ONE_SHOT": "One-shot",
            "NOVEL": "Light Novel",
            "": "",
        },
    }
    source_labels = {
        "ORIGINAL": "Original",
        "MANGA": "Manga",
        "": "",
        "VIDEO_GAME": "Game",
        "VISUAL_NOVEL": "Visual Novel",
        "LIGHT_NOVEL": "Light Novel",
        "OTHER": "Other",
        "NOVEL": "Novel",
        "WEB_NOVEL": "Web Novel",
        "MULTIMEDIA_PROJECT": "Mixed Media",
        "PICTURE_BOOK": "Picture Book",
        "DOUJINSHI": "Doujinshi",
        "GAME": "Game",
        "ANIME": "Anime",
        "LIVE_ACTION": "Other",
        "COMIC": "Other",
    }

    def parseint(text):
        # Counts may be stored as "12" or "12.0"; blank stays blank.
        if text == "":
            return ""
        digits = text[: -len(".0")] if text.endswith(".0") else text
        return int(digits)

    def date(text):
        # The cell always has three "-"-separated slots; empty slots mean the
        # component is unknown.
        year, month, day = (part if part != "" else None for part in text.split("-"))
        return make_date(year, month, day)

    def mediatype(label, medium):
        return type_labels[medium][label]

    def status(label, startdate, medium):
        # Not-yet-released titles are "Upcoming" only with a known start date.
        pending = "Upcoming" if startdate != "" else "TBA"
        anime = {
            "FINISHED": "Finished",
            "NOT_YET_RELEASED": pending,
            "RELEASING": "Releasing",
            "CANCELLED": "Cancelled",
            "": "",
        }
        manga = {**anime, "HIATUS": "On Hiatus"}
        return {"anime": anime, "manga": manga}[medium][label]

    def season(text):
        # A lone space marks a missing season.
        if text == " ":
            return ""
        parts = text.split()
        return parts[0] + "-" + parts[1].lower()

    def source(label):
        if label == "":
            return ""
        return source_labels[label.strip()]

    def to_json(text):
        return json.dumps(eval(text))

    def maybe_parse(frame, col, parsefn):
        # Missing columns produce a blank output column.
        return frame[col].apply(parsefn) if col in frame.columns else ""

    frames = []
    for path in tqdm(glob.glob(f"../../../data/anilist/media_facts/{medium}.*.csv")):
        raw = pd.read_csv(path, keep_default_na=False)
        out = pd.DataFrame()
        out["uid"] = raw["anilistid"]
        out["title"] = raw["title"]
        out["alttitle"] = raw["alttitle"]
        out["summary"] = raw["summary"]
        out["mediatype"] = raw["type"].apply(lambda label: mediatype(label, medium))
        out["startdate"] = raw["startdate"].apply(date)
        out["enddate"] = raw["enddate"].apply(date)
        out["episodes"] = raw["episodes"].apply(parseint)
        out["duration"] = maybe_parse(raw, "duration", parseint)
        out["chapters"] = raw["chapters"].apply(parseint)
        out["volumes"] = raw["volumes"].apply(parseint)
        if "status" not in raw:
            out["status"] = ""
        else:
            out["status"] = [
                status(label, start, medium)
                for label, start in zip(raw["status"], out["startdate"])
            ]
        out["season"] = raw["season"].apply(season)
        out["studios"] = raw["studios"].apply(to_json)
        out["genres"] = raw["genres"].apply(to_json)
        out["source"] = maybe_parse(raw, "source", source)
        out["accessed_at"] = raw["accessed_at"]
        frames.append(out)
    return pd.concat(frames, ignore_index=True)

In [None]:
def import_kitsu(medium):
    """Standardize the ``kitsu`` media facts for one medium.

    Reads every ``{medium}.*.csv`` file under ``data/kitsu/media_facts`` and
    converts the raw columns into the shared output columns. The season,
    studios and source columns are always written blank for this source.

    Args:
        medium: ``"anime"`` or ``"manga"``.

    Returns:
        One DataFrame holding the rows of all matching files.

    Raises:
        KeyError: If a raw type or status value has no mapping, or a
            required raw column is missing.
        ValueError: If no input file matches (``pd.concat`` gets nothing).
    """

    def parseint(x):
        """Parse "12" or "12.0" as int; blank stays blank."""
        if x == "":
            return ""
        if x.endswith(".0"):
            x = x[: -len(".0")]
        return int(x)

    def date(x):
        """Convert a ``yyyy-mm-dd`` cell, dropping placeholder components."""
        if x == "":
            return ""
        year, month, day = x.split("-")

        # kitsu marks yyyy as yyyy-01-01 and yyyy-mm as yyyy-mm-01
        # if the exact date is unknown
        if day == "" or int(day) in [0, 1]:
            day = None
        # Month 01 is only a placeholder when the day is one as well. Before,
        # any January date lost its month and day (2020-01-15 became 2020).
        if month == "" or int(month) == 0 or (int(month) == 1 and day is None):
            month = None
        if year == "" or int(year) in [0, 1]:
            year = None
        return make_date(year, month, day)

    def mediatype(x, medium):
        """Map the raw subtype label; unknown labels raise KeyError."""
        maps = {
            "anime": {
                "movie": "Movie",
                "TV": "TV",
                "ONA": "ONA",
                "OVA": "OVA",
                "special": "Special",
                "music": "Music",
            },
            "manga": {
                "doujin": "Doujinshi",
                "manhwa": "Manhwa",
                "manga": "Manga",
                "manhua": "Manhua",
                "novel": "Light Novel",
                "oel": "OEL",
                "oneshot": "One-shot",
            },
        }
        return maps[medium][x]

    def status(x, medium):
        """Map the raw status label to the shared status vocabulary."""
        maps = {
            "anime": {
                "finished": "Finished",
                "current": "Releasing",
                "tba": "TBA",
                "unreleased": "TBA",
                "upcoming": "Upcoming",
                "": "",
            },
            "manga": {
                "finished": "Finished",
                "current": "Releasing",
                "tba": "TBA",
                "upcoming": "Upcoming",
                "": "",
            },
        }
        return maps[medium][x]

    def to_json(x):
        """Re-encode a Python-literal cell as JSON."""
        # NOTE(review): eval executes whatever the CSV cell contains. This is
        # only acceptable while the input files are trusted.
        return json.dumps(eval(x))

    def maybe_parse(df, col, parsefn):
        """Apply ``parsefn`` to ``df[col]``, or give "" when the column is absent."""
        if col not in df.columns:
            return ""
        return df[col].apply(parsefn)

    dfs = []
    for f in tqdm(glob.glob(f"../../../data/kitsu/media_facts/{medium}.*.csv")):
        # keep_default_na=False keeps blank cells as "" for the parsers above.
        r = pd.read_csv(f, keep_default_na=False)
        df = pd.DataFrame()
        df["uid"] = r["kitsuid"]
        df["title"] = r["title"]
        df["alttitle"] = r["alttitle"]
        df["summary"] = r["summary"]
        df["mediatype"] = r["type"].apply(lambda x: mediatype(x, medium))
        df["startdate"] = r["startdate"].apply(date)
        df["enddate"] = r["enddate"].apply(date)
        df["episodes"] = r["episodes"].apply(parseint)
        df["duration"] = maybe_parse(r, "duration", parseint)
        df["chapters"] = r["chapters"].apply(parseint)
        df["volumes"] = r["volumes"].apply(parseint)
        df["status"] = maybe_parse(r, "status", lambda x: status(x, medium))
        df["season"] = ""
        df["studios"] = "[]"
        df["genres"] = r["genres"].apply(to_json)
        df["source"] = ""
        df["accessed_at"] = r["accessed_at"]
        dfs.append(df)

    return pd.concat(dfs, ignore_index=True)

In [None]:
def import_animeplanet(medium):
    """Standardize the ``animeplanet`` media facts for one medium.

    Reads every ``{medium}.*.csv`` file under ``data/animeplanet/media_facts``
    and converts the raw columns into the shared output columns. Several
    outputs (mediatype, episodes, duration, chapters, volumes) are parsed out
    of the free-text ``type`` column or the title suffix. The status column
    is always written blank for this source.

    Args:
        medium: ``"anime"`` or ``"manga"``.

    Returns:
        One DataFrame holding the rows of all matching files.

    Raises:
        AssertionError: If a ``type`` cell does not contain exactly one
            episode or duration match where one is expected.
        KeyError: If an anime type label has no mapping, or a required raw
            column is missing.
        ValueError: If no input file matches (``pd.concat`` gets nothing).
    """
    # Title suffixes that mark a manga-side entry as something other than a
    # plain manga, with the media type each suffix stands for.
    manga_suffixes = {
        " (Light Novel)": "Light Novel",
        " (Novel)": "Novel",
        " (Pilot)": "One-shot",
        " (Promo)": "One-shot",
        " (Doujinshi)": "Doujinshi",
        " (One Shot)": "One-shot",
    }
    anime_types = {
        "TV": "TV",
        "Web": "ONA",
        "Movie": "Movie",
        "TV Special": "TV Special",
        "OVA": "OVA",
        "Music Video": "Music",
        "DVD Special": "Special",
        "Other": "",
        "": "",
    }
    # Raw strings: "\(" and "\)" are invalid escapes in normal literals.
    # Compiled once here instead of once per parsed cell.
    episodes_re = re.compile(r"\(([^<>]+) ep")
    duration_re = re.compile(r"([^<>]+) min\)")
    chapters_re = re.compile(r"Ch: ([^<>;]+)")
    volumes_re = re.compile(r"Vol: ([^<>;]+)")

    def unpack(x):
        """Return the only element of ``x``; anything else is an error."""
        assert len(x) == 1
        return x[0]

    def maybe_unpack(x):
        """Like ``unpack`` but an empty list gives ""."""
        if len(x) == 0:
            return ""
        assert len(x) == 1
        return x[0]

    def title(x):
        """Strip a trailing media-type suffix such as " (Light Novel)"."""
        for s in manga_suffixes:
            if x.endswith(s):
                return x[: -len(s)]
        return x

    def startdate(x):
        """First year of a "start - end" cell, or "" when unknown."""
        if x == "TBA ":
            return ""
        fields = x.strip().split(" - ")
        if fields[0] in ["?", ""]:
            return ""
        return make_date(fields[0], None, None)

    def enddate(x):
        """Second year of a "start - end" cell, or "" when absent or unknown."""
        if x == "TBA ":
            return ""
        fields = x.strip().split(" - ")
        if len(fields) == 1:
            return ""
        assert len(fields) == 2
        if fields[1] in ["?", ""]:
            return ""
        return make_date(fields[1], None, None)

    def mediatype(x, medium):
        """Media type from the title suffix (manga) or type label (anime)."""
        if medium == "manga":
            for s in manga_suffixes:
                if x.endswith(s):
                    return manga_suffixes[s]
            return "Manga"
        elif medium == "anime":
            # Only the first line of the cell carries the type label.
            key = x.strip().split("\n")[0]
            return anime_types[key]
        else:
            raise AssertionError(medium)

    def episodes(x, medium):
        """Episode count text from the second line of the type cell."""
        if medium != "anime":
            return ""
        fields = x.strip().split("\n")
        if len(fields) != 2:
            return ""
        key = fields[1].split(" x ")[0]
        return unpack(episodes_re.findall(key))

    def duration(x, medium):
        """Minutes text following " x " on the second line of the type cell."""
        if medium != "anime":
            return ""
        fields = x.strip().split("\n")
        if len(fields) != 2:
            return ""
        fields = fields[1].split(" x ")
        if len(fields) != 2:
            return ""
        key = fields[1]
        return unpack(duration_re.findall(key))

    def chapters(x, medium):
        """Text following "Ch: " in the type cell, or "" when absent."""
        if medium != "manga":
            return ""
        key = x.strip()
        return maybe_unpack(chapters_re.findall(key))

    def volumes(x, medium):
        """Text following "Vol: " in the type cell, or "" when absent."""
        if medium != "manga":
            return ""
        key = x.strip()
        return maybe_unpack(volumes_re.findall(key))

    def season(x):
        """Swap the two "-"-separated parts of the season cell."""
        if x == "":
            return ""
        fields = x.split("-")
        return "-".join([fields[1], fields[0]])

    def source(x):
        """Source material from the first "based-on-..." tag in the genre list."""
        if x == "":
            return ""
        source_map = {
            "based-on-a-web-novel": "Web Novel",
            "based-on-a-manga": "Manga",
            "based-on-a-light-novel": "Light Novel",
            "based-on-a-novel": "Novel",
            "based-on-a-video-game": "Game",
            "based-on-a-visual-novel": "Visual Novel",
            "based-on-an-anime": "Anime",
            "based-on-a-mobile-game": "Game",
            "based-on-a-doujinshi": "Doujinshi",
            "based-on-an-eroge": "Game",
            "based-on-a-fairy-tale": "Other",
            "based-on-a-4-koma-manga": "4-koma Manga",
            "based-on-an-otome-game": "Game",
            "based-on-a-movie": "Other",
            "based-on-a-webtoon": "Web Manga",
            "based-on-a-song": "Other",
            "based-on-a-card-game": "Card Game",
            "based-on-a-picture-book": "Picture Book",
            "based-on-a-tv-series": "Other",
            "based-on-a-cartoon": "Other",
            "based-on-a-play": "Other",
            "based-on-a-comic-book": "Other",
            "based-on-a-religious-text": "",
        }
        # NOTE(review): eval executes whatever the CSV cell contains. This is
        # only acceptable while the input files are trusted.
        for k in eval(x):
            if k in source_map:
                return source_map[k]
        return ""

    def to_json(x):
        """Re-encode a Python-literal cell as JSON (same eval caveat as above)."""
        return json.dumps(eval(x))

    def studios(x, version):
        """Studio list as JSON; the cell format depends on the api version."""
        if get_version(version) >= get_version("1.1.0"):
            if x == "":
                x = "[]"
            return to_json(x)
        else:
            # Older rows hold whitespace-separated names with "-" for spaces.
            return json.dumps([s.replace("-", " ") for s in x.split()])

    def maybe_parse(df, col, parsefn):
        """Apply ``parsefn`` to ``df[col]``, or give "" when the column is absent."""
        if col not in df.columns:
            return ""
        return df[col].apply(parsefn)

    dfs = []
    for f in tqdm(glob.glob(f"../../../data/animeplanet/media_facts/{medium}.*.csv")):
        # keep_default_na=False keeps blank cells as "" for the parsers above.
        r = pd.read_csv(f, keep_default_na=False)
        df = pd.DataFrame()
        df["uid"] = r["url"]
        df["title"] = r["title"].apply(title)
        df["alttitle"] = r["alttitle"]
        df["summary"] = r["summary"].apply(lambda x: x.strip())
        if medium == "manga":
            df["mediatype"] = r["title"].apply(lambda x: mediatype(x, medium))
        elif medium == "anime":
            df["mediatype"] = maybe_parse(r, "type", lambda x: mediatype(x, medium))
        df["startdate"] = r["year"].apply(startdate)
        df["enddate"] = r["year"].apply(enddate)
        df["episodes"] = maybe_parse(r, "type", lambda x: episodes(x, medium))
        df["duration"] = maybe_parse(r, "type", lambda x: duration(x, medium))
        df["chapters"] = maybe_parse(r, "type", lambda x: chapters(x, medium))
        df["volumes"] = maybe_parse(r, "type", lambda x: volumes(x, medium))
        df["status"] = ""
        df["season"] = r["season"].apply(season)
        df["studios"] = [
            studios(x, y) for (x, y) in zip(r["studios"], r["api_version"])
        ]
        df["genres"] = r["genres"].apply(to_json)
        df["source"] = r["genres"].apply(source)
        df["accessed_at"] = r["accessed_at"]
        dfs.append(df)
    df = pd.concat(dfs, ignore_index=True)
    if medium == "manga":
        # if an item has multiple versions and isn't a light novel, assume it's a manga
        # NOTE(review): mediatype() above never returns "" for manga, so this
        # replacement currently changes nothing; confirm whether it is still
        # needed.
        r = import_media_relations("animeplanet", "manga")
        r = r.query("source_media == 'manga' and target_media == 'manga'")
        uids = set(r["source_id"]) | set(r["target_id"])
        idx = df.uid.isin(uids)
        df.loc[idx, "mediatype"] = df.loc[idx, "mediatype"].replace({"": "Manga"})
    return df

In [None]:
# Values allowed in the standardized "mediatype" column, per medium.
# The empty string stands for "unknown" in all three tables.
MEDIATYPES = {
    "anime": {
        "",
        "CM",
        "Movie",
        "Music",
        "ONA",
        "OVA",
        "PV",
        "Special",
        "TV",
        "TV Special",
    },
    "manga": {
        "",
        "Doujinshi",
        "Light Novel",
        "Manga",
        "Manhua",
        "Manhwa",
        "Novel",
        "OEL",
        "One-shot",
    },
}

# Values allowed in the standardized "status" column, per medium.
STATUSES = {
    "anime": {"", "Cancelled", "Finished", "Releasing", "TBA", "Upcoming"},
    "manga": {
        "",
        "Cancelled",
        "Finished",
        "On Hiatus",
        "Releasing",
        "TBA",
        "Upcoming",
    },
}

# Values allowed in the standardized "source" column (same for both media).
SOURCES = {
    "",
    "4-koma Manga",
    "Anime",
    "Book",
    "Card Game",
    "Doujinshi",
    "Game",
    "Light Novel",
    "Manga",
    "Mixed Media",
    "Music",
    "Novel",
    "Original",
    "Other",
    "Picture Book",
    "Radio",
    "Visual Novel",
    "Web Manga",
    "Web Novel",
}


def validate(df, medium):
    """Assert that the categorical columns only hold allowed values.

    Checks ``mediatype``, ``status`` and ``source`` in that order; the
    AssertionError carries the set of offending values.
    """

    def check(values, allowed):
        unexpected = set(values) - allowed
        assert not unexpected, unexpected

    check(df.mediatype, MEDIATYPES[medium])
    check(df.status, STATUSES[medium])
    check(df.source, SOURCES)

In [None]:
# Write the standardized media table and relation table for every
# source and medium into the shared output directory.
outdir = "../../../data/media/sources"
if not os.path.exists(outdir):
    os.makedirs(outdir, exist_ok=True)
import_fns = dict(
    mal=import_mal,
    anilist=import_anilist,
    kitsu=import_kitsu,
    animeplanet=import_animeplanet,
)
for medium in ("manga", "anime"):
    for source, fn in import_fns.items():
        # Media facts: standardize, validate, then save.
        df = fn(medium)
        validate(df, medium)
        df.to_csv(f"{outdir}/{source}.{medium}.csv", index=False)
        # Relations between media items of the same source.
        r = import_media_relations(source, medium)
        r.to_csv(f"{outdir}/{source}.{medium}_relations.csv", index=False)

In [None]:
# Save a deterministic RNG seed: the largest last_update_timestamp found in
# the user status files of all sources.
max_ts = float("-inf")
for s in ("mal", "anilist", "kitsu", "animeplanet"):
    files = glob.glob(f"../../../data/{s}/user_media_facts/user_status.*.csv")
    # Read everything as text first, then convert only the timestamp column.
    statuses = pd.concat(pd.read_csv(f, dtype=str) for f in files)
    ts = statuses["last_update_timestamp"].astype(int).max()
    max_ts = max(max_ts, ts)
with open("../../../data/rng.csv", "w") as f:
    f.write(f"seed\n{max_ts}\n")