# Standardize
* Aligns media facts from different sources into a shared set of columns
* Note that the values in the status, genre, and type columns are still source-specific (they are not yet unified across sources)

In [None]:
import glob
import json

import pandas as pd
from tqdm import tqdm

In [None]:
def make_date(year, month, day):
    """Join the leading non-``None`` date components with dashes.

    Components are consumed in year, month, day order and the walk stops at
    the first ``None``: a month is only emitted when a year is present, and a
    day only when both year and month are present. Each component is passed
    through ``str`` as-is (no zero padding is applied).

    Returns:
        str: e.g. ``"2020-1-5"``, ``"2020-1"``, ``"2020"``, or ``""`` when
        ``year`` is ``None``.
    """
    parts = []
    for component in (year, month, day):
        if component is None:
            # Anything after a missing component is dropped.
            break
        parts.append(str(component))
    return "-".join(parts)

In [None]:
def import_mal_anime():
    """Load the MyAnimeList anime fact files into the standardized columns.

    Reads every CSV matching ``../../../data/mal/media_facts/anime.*.csv``
    (blank cells are kept as empty strings rather than NaN), maps each file
    onto the shared column set and concatenates the results.

    Returns:
        pandas.DataFrame: all files stacked with a fresh integer index.
        ``chapters`` and ``volumes`` are always blank for anime.

    Raises:
        ValueError: raised by ``pd.concat`` when no file matches the glob.
    """
    # Month abbreviation -> month number. Built once per import instead of
    # once per parsed cell.
    months = {
        "Jan": 1,
        "Feb": 2,
        "Mar": 3,
        "Apr": 4,
        "May": 5,
        "Jun": 6,
        "Jul": 7,
        "Aug": 8,
        "Sep": 9,
        "Oct": 10,
        "Nov": 11,
        "Dec": 12,
    }

    def episodes(x):
        # "" and "Unknown" both mean the count is missing; keep it blank.
        if x in ["", "Unknown"]:
            return ""
        return int(x)

    def date(x):
        # "Not available" marks a missing date.
        if x == "Not available":
            return ""
        fields = x.replace(",", "").split()
        # An unrecognized month token maps to None, which make_date treats as
        # "stop here", so the result degrades to a year-only date instead of
        # raising KeyError (same behavior as the MAL manga importer).
        if len(fields) == 3:
            # Three tokens are read as: month, day, year.
            return make_date(fields[2], months.get(fields[0]), fields[1])
        if len(fields) == 2:
            # Two tokens are read as: month, year.
            return make_date(fields[1], months.get(fields[0]), None)
        # Any other token count: only the first token is used, as the year.
        return make_date(fields[0], None, None)

    def season(x):
        if x == "":
            return ""
        # Swap the two whitespace-separated tokens and lowercase the first
        # (presumably "Season Year" -> "year-season"; verify against data).
        fields = x.split()
        return "-".join([fields[1], fields[0].lower()])

    def to_json(x):
        # NOTE(security): eval() executes whatever expression the CSV cell
        # contains. The cells come from scraped data; if they only ever hold
        # Python literals, ast.literal_eval would be a safer drop-in.
        return json.dumps(eval(x))

    dfs = []
    for f in tqdm(glob.glob("../../../data/mal/media_facts/anime.*.csv")):
        r = pd.read_csv(f, keep_default_na=False)
        df = pd.DataFrame()
        df["uid"] = r["uid"]
        df["title"] = r["title"]
        df["alttitle"] = r["english_title"]
        df["summary"] = r["summary"]
        df["mediatype"] = r["type"]  # TODO
        df["startdate"] = r["start_date"].apply(date)
        df["enddate"] = r["end_date"].apply(date)
        df["episodes"] = r["num_episodes"].apply(episodes)
        df["chapters"] = ""
        df["volumes"] = ""
        df["status"] = r["status"]  # TODO
        df["season"] = r["season"].apply(season)
        df["studios"] = r["studios"].apply(to_json)
        df["genres"] = r["genres"].apply(to_json)
        df["accessed_at"] = r["accessed_at"]
        dfs.append(df)
    return pd.concat(dfs, ignore_index=True)

In [None]:
def import_mal_manga():
    """Load the MyAnimeList manga fact files into the standardized columns.

    Every CSV matching ``../../../data/mal/media_facts/manga.*.csv`` is read
    with blank cells kept as empty strings, converted to the shared column
    set, and the per-file frames are concatenated with a fresh index.
    ``episodes`` and ``season`` are always blank for manga.
    """
    month_numbers = {
        "Jan": 1,
        "Feb": 2,
        "Mar": 3,
        "Apr": 4,
        "May": 5,
        "Jun": 6,
        "Jul": 7,
        "Aug": 8,
        "Sep": 9,
        "Oct": 10,
        "Nov": 11,
        "Dec": 12,
    }

    def count(x):
        # Shared by the chapter and volume columns: "" and "?" both mean the
        # count is missing and stay blank; everything else must be an integer.
        return "" if x in ["", "?"] else int(x)

    def date(x):
        if x == "Not available":
            return ""
        tokens = x.replace(",", "").split()
        if len(tokens) == 3:
            month_name, day, year = tokens
            # Unknown month tokens become None -> year-only date.
            return make_date(year, month_numbers.get(month_name, None), day)
        if len(tokens) == 2:
            month_name, year = tokens
            return make_date(year, month_numbers.get(month_name, None), None)
        return make_date(tokens[0], None, None)

    def to_json(x):
        # NOTE(security): eval() runs the cell as a Python expression; the
        # input is scraped data, so this deserves a safer parser.
        parsed = eval(x)
        return json.dumps(parsed)

    def standardize(raw):
        # Map one raw MAL manga frame onto the shared column layout.
        out = pd.DataFrame()
        out["uid"] = raw["uid"]
        out["title"] = raw["title"]
        out["alttitle"] = raw["english_title"]
        out["summary"] = raw["summary"]
        out["mediatype"] = raw["type"]
        out["startdate"] = raw["start_date"].apply(date)
        out["enddate"] = raw["end_date"].apply(date)
        out["episodes"] = ""
        out["chapters"] = raw["num_chapters"].apply(count)
        out["volumes"] = raw["num_volumes"].apply(count)
        out["status"] = raw["status"]
        out["season"] = ""
        out["studios"] = raw["studios"].apply(to_json)
        out["genres"] = raw["genres"].apply(to_json)
        out["accessed_at"] = raw["accessed_at"]
        return out

    paths = glob.glob("../../../data/mal/media_facts/manga.*.csv")
    frames = [
        standardize(pd.read_csv(path, keep_default_na=False))
        for path in tqdm(paths)
    ]
    return pd.concat(frames, ignore_index=True)

In [None]:
def import_mal(medium):
    """Run the MyAnimeList importer registered for ``medium``.

    ``medium`` must be ``"anime"`` or ``"manga"``; any other key raises
    ``KeyError`` from the lookup. Returns whatever the selected importer
    returns.
    """
    importers = dict(anime=import_mal_anime, manga=import_mal_manga)
    run_import = importers[medium]
    return run_import()

In [None]:
def import_anilist(medium):
    """Load the AniList fact files for ``medium`` into the standardized columns.

    Reads every CSV matching
    ``../../../data/anilist/media_facts/{medium}.*.csv`` (blank cells are kept
    as empty strings rather than NaN), maps each file onto the shared column
    set and concatenates the results.

    Args:
        medium: file-name prefix selecting which files to read; the driver
            cell passes ``"anime"`` or ``"manga"``.

    Returns:
        pandas.DataFrame: all files stacked with a fresh integer index.
        ``status`` is always blank for this source.

    Raises:
        ValueError: raised by ``pd.concat`` when no file matches the glob.
    """

    def parseint(x):
        if x == "":
            return ""
        # When every cell of a column is numeric, pandas infers a numeric
        # dtype and hands over numbers instead of strings. Going through
        # str() lets both forms take the same path instead of failing on
        # the missing ``endswith`` attribute.
        text = str(x)
        if text.endswith(".0"):
            text = text[: -len(".0")]
        return int(text)

    def date(x):
        # A completely blank cell means no date; without this guard the
        # three-way unpack below raises ValueError.
        if x == "":
            return ""
        year, month, day = x.split("-")

        def nn(part):
            # Blank components are reported to make_date as missing.
            return part if part != "" else None

        return make_date(nn(year), nn(month), nn(day))

    def season(x):
        fields = x.split()
        # Blank or whitespace-only cells (the original sentinel was " ")
        # carry no season.
        if not fields:
            return ""
        # First token is kept, second is lowercased
        # (presumably "Year SEASON" -> "year-season"; verify against data).
        return "-".join([fields[0], fields[1].lower()])

    def to_json(x):
        # NOTE(security): eval() executes whatever expression the CSV cell
        # contains. The cells come from scraped data; if they only ever hold
        # Python literals, ast.literal_eval would be a safer drop-in.
        return json.dumps(eval(x))

    dfs = []
    for f in tqdm(glob.glob(f"../../../data/anilist/media_facts/{medium}.*.csv")):
        r = pd.read_csv(f, keep_default_na=False)
        df = pd.DataFrame()
        df["uid"] = r["anilistid"]
        df["title"] = r["title"]
        df["alttitle"] = r["alttitle"]
        df["summary"] = r["summary"]
        df["mediatype"] = r["type"]
        df["startdate"] = r["startdate"].apply(date)
        df["enddate"] = r["enddate"].apply(date)
        df["episodes"] = r["episodes"].apply(parseint)
        df["chapters"] = r["chapters"].apply(parseint)
        df["volumes"] = r["volumes"].apply(parseint)
        df["status"] = ""
        df["season"] = r["season"].apply(season)
        df["studios"] = r["studios"].apply(to_json)
        df["genres"] = r["genres"].apply(to_json)
        df["accessed_at"] = r["accessed_at"]
        dfs.append(df)
    return pd.concat(dfs, ignore_index=True)

In [None]:
def import_kitsu(medium):
    """Load the Kitsu fact files for ``medium`` into the standardized columns.

    Reads every CSV matching
    ``../../../data/kitsu/media_facts/{medium}.*.csv`` (blank cells are kept
    as empty strings rather than NaN), maps each file onto the shared column
    set and concatenates the results.

    Args:
        medium: file-name prefix selecting which files to read; the driver
            cell passes ``"anime"`` or ``"manga"``.

    Returns:
        pandas.DataFrame: all files stacked with a fresh integer index.
        ``status``, ``season`` and ``studios`` are always blank for this
        source.

    Raises:
        ValueError: raised by ``pd.concat`` when no file matches the glob.
    """

    def parseint(x):
        if x == "":
            return ""
        # When every cell of a column is numeric, pandas infers a numeric
        # dtype and hands over numbers instead of strings. Going through
        # str() lets both forms take the same path instead of failing on
        # the missing ``endswith`` attribute.
        text = str(x)
        if text.endswith(".0"):
            text = text[: -len(".0")]
        return int(text)

    def date(x):
        if x == "":
            return ""
        year, month, day = x.split("-")

        def nn(part):
            # Blank or all-zero components are reported to make_date as
            # missing, which truncates the date at that point.
            if part == "":
                return None
            if int(part) == 0:
                return None
            return part

        return make_date(nn(year), nn(month), nn(day))

    def to_json(x):
        # NOTE(security): eval() executes whatever expression the CSV cell
        # contains. The cells come from scraped data; if they only ever hold
        # Python literals, ast.literal_eval would be a safer drop-in.
        return json.dumps(eval(x))

    dfs = []
    for f in tqdm(glob.glob(f"../../../data/kitsu/media_facts/{medium}.*.csv")):
        r = pd.read_csv(f, keep_default_na=False)
        df = pd.DataFrame()
        df["uid"] = r["kitsuid"]
        df["title"] = r["title"]
        df["alttitle"] = r["alttitle"]
        df["summary"] = r["summary"]
        df["mediatype"] = r["type"]
        df["startdate"] = r["startdate"].apply(date)
        df["enddate"] = r["enddate"].apply(date)
        df["episodes"] = r["episodes"].apply(parseint)
        df["chapters"] = r["chapters"].apply(parseint)
        df["volumes"] = r["volumes"].apply(parseint)
        df["status"] = ""
        df["season"] = ""
        df["studios"] = ""
        df["genres"] = r["genres"].apply(to_json)
        df["accessed_at"] = r["accessed_at"]
        dfs.append(df)

    return pd.concat(dfs, ignore_index=True)

In [None]:
def import_animeplanet(medium):
    """Load the Anime-Planet fact files for ``medium`` into the standardized columns.

    Reads every CSV matching
    ``../../../data/animeplanet/media_facts/{medium}.*.csv`` (blank cells are
    kept as empty strings rather than NaN), maps each file onto the shared
    column set and concatenates the results.

    Args:
        medium: file-name prefix selecting which files to read; the driver
            cell passes ``"anime"`` or ``"manga"``.

    Returns:
        pandas.DataFrame: all files stacked with a fresh integer index.
        ``mediatype``, ``episodes``, ``chapters``, ``volumes`` and ``status``
        are always blank for this source; dates carry at most a year.

    Raises:
        ValueError: if a ``year`` cell splits into more than two
            ``" - "``-separated parts, or (from ``pd.concat``) when no file
            matches the glob.
    """

    def startdate(x):
        # Compare after stripping so "TBA" is recognized with or without
        # surrounding whitespace.
        text = x.strip()
        if text == "TBA":
            return ""
        # The part before " - " (or the whole cell) is used as the year.
        return make_date(text.split(" - ")[0], None, None)

    def enddate(x):
        text = x.strip()
        if text == "TBA":
            return ""
        fields = text.split(" - ")
        if len(fields) == 1:
            # A single value carries no separate end year.
            return ""
        if len(fields) != 2:
            # Explicit check instead of assert so it survives ``python -O``.
            raise ValueError(f"unexpected year range: {x!r}")
        return make_date(fields[1], None, None)

    def season(x):
        if x == "":
            return ""
        # Swap the two dash-separated parts
        # (presumably "season-year" -> "year-season"; verify against data).
        fields = x.split("-")
        return "-".join([fields[1], fields[0]])

    def studios(x):
        # The cell is split on whitespace into a JSON list.
        return json.dumps(x.split())

    def to_json(x):
        # NOTE(security): eval() executes whatever expression the CSV cell
        # contains. The cells come from scraped data; if they only ever hold
        # Python literals, ast.literal_eval would be a safer drop-in.
        return json.dumps(eval(x))

    dfs = []
    for f in tqdm(glob.glob(f"../../../data/animeplanet/media_facts/{medium}.*.csv")):
        r = pd.read_csv(f, keep_default_na=False)
        df = pd.DataFrame()
        df["uid"] = r["url"]
        df["title"] = r["title"]
        df["alttitle"] = r["alttitle"]
        df["summary"] = r["summary"].apply(lambda x: x.strip())
        df["mediatype"] = ""
        df["startdate"] = r["year"].apply(startdate)
        df["enddate"] = r["year"].apply(enddate)
        df["episodes"] = ""
        df["chapters"] = ""
        df["volumes"] = ""
        df["status"] = ""
        df["season"] = r["season"].apply(season)
        df["studios"] = r["studios"].apply(studios)
        df["genres"] = r["genres"].apply(to_json)
        df["accessed_at"] = r["accessed_at"]
        dfs.append(df)
    return pd.concat(dfs, ignore_index=True)

In [None]:
# Source name -> importer taking the medium ("manga" / "anime") and returning
# the standardized frame for that source.
import_fns = dict(
    mal=import_mal,
    anilist=import_anilist,
    kitsu=import_kitsu,
    animeplanet=import_animeplanet,
)

# Write one standardized CSV per (medium, source) pair, manga first.
for medium in ("manga", "anime"):
    for source, fn in import_fns.items():
        standardized = fn(medium)
        standardized.to_csv(
            f"../../../data/media/{source}.{medium}.csv",
            index=False,
        )