# Match items using MAL IDs

In [None]:
import glob
import os
from functools import cache

import pandas as pd
from tqdm import tqdm

In [None]:
@cache
def get_valid_ids(medium, source):
    """Return the set of `uid` values listed for one source and medium.

    Reads `../../../data/media/sources/{source}.{medium}.csv` (relative to the
    current working directory) with every column parsed as a string and empty
    cells kept as "" rather than NaN. The result is memoized per
    (medium, source) pair, so each file is read at most once per process.
    """
    source_table = pd.read_csv(
        f"../../../data/media/sources/{source}.{medium}.csv",
        dtype=str,
        keep_default_na=False,
    )
    uid_column = source_table.uid
    return set(uid_column)

In [None]:
def anilist_to_mal(medium):
    """Build a dict mapping AniList IDs to MAL IDs for `medium`.

    Every file matching `../../../data/anilist/media_facts/{medium}.*.csv` is
    read with all columns as strings. A pair is kept only when the MAL ID is
    non-empty and both IDs appear in the corresponding valid-ID sets returned
    by `get_valid_ids`.

    Returns:
        dict[str, str]: anilistid -> malid.
    """
    mapping = {}
    valid_mal_ids = get_valid_ids(medium, "mal")
    valid_anilist_ids = get_valid_ids(medium, "anilist")
    # NOTE: glob returns files in arbitrary order; if an anilistid occurs in
    # several files, the pair from the file processed last wins.
    for path in glob.glob(f"../../../data/anilist/media_facts/{medium}.*.csv"):
        df = pd.read_csv(path, keep_default_na=False, dtype=str)
        # Iterate the two columns directly instead of building a row Series
        # with df.iloc[i] twice per row.
        for anilistid, malid in zip(df["anilistid"], df["malid"]):
            # Drop a trailing ".0" (presumably IDs that were serialized as
            # floats upstream -- TODO confirm).
            malid = malid.removesuffix(".0")
            if not malid:
                continue
            if malid in valid_mal_ids and anilistid in valid_anilist_ids:
                mapping[anilistid] = malid
    return mapping

In [None]:
def kitsu_to_mal(medium):
    """Build a dict mapping Kitsu IDs to MAL IDs for `medium`.

    Scans every file matching
    `../../../data/kitsu/user_media_facts/user_{medium}_list.*.csv` (with a
    tqdm progress bar), takes the distinct (kitsuid, malid) pairs, and keeps
    those whose malid is non-empty and whose two IDs are both present in the
    valid-ID sets from `get_valid_ids`. Pairs from later files overwrite
    earlier ones for the same kitsuid.
    """
    mapping = {}
    valid_mal_ids = get_valid_ids(medium, "mal")
    valid_kitsu_ids = get_valid_ids(medium, "kitsu")
    for f in tqdm(
        glob.glob(f"../../../data/kitsu/user_media_facts/user_{medium}_list.*.csv")
    ):
        pairs = pd.read_csv(f, dtype=str, keep_default_na=False)
        pairs = pairs[["kitsuid", "malid"]].drop_duplicates()
        # One combined mask: non-empty MAL ID and both IDs known to be valid.
        keep = (
            (pairs["malid"] != "")
            & pairs["malid"].isin(valid_mal_ids)
            & pairs["kitsuid"].isin(valid_kitsu_ids)
        )
        pairs = pairs[keep]
        # Later rows overwrite earlier ones that share a kitsuid.
        mapping.update(dict(zip(pairs["kitsuid"], pairs["malid"])))
    return mapping

In [None]:
def get_malid_mapping(medium, source1, source2):
    """Return a two-column DataFrame mapping `source1` IDs to `source2` IDs.

    Only the pairs (anilist -> mal) and (kitsu -> mal) are backed by an actual
    lookup; any other combination yields an empty frame that still carries
    the two columns named `source1` and `source2`.
    """
    if source2 != "mal":
        pairs = {}
    elif source1 == "anilist":
        pairs = anilist_to_mal(medium)
    elif source1 == "kitsu":
        pairs = kitsu_to_mal(medium)
    else:
        pairs = {}
    # keys() and values() iterate in the same (insertion) order, so the two
    # columns stay aligned row by row.
    columns = {source1: list(pairs.keys()), source2: list(pairs.values())}
    return pd.DataFrame.from_dict(columns)

In [None]:
# Export one CSV per unordered pair of sources and per medium, named
# "{medium}.{later source}.{earlier source}.csv" after the positions of the
# two sources in `sources`.
outdir = "../../../data/media/malid"
os.makedirs(outdir, exist_ok=True)
# NOTE: this changes the process working directory; the relative paths used
# by the helper functions called below are resolved from `outdir` afterwards.
os.chdir(outdir)
sources = ["mal", "anilist", "kitsu", "animeplanet"]
for medium in ["manga", "anime"]:
    for i, target in enumerate(sources):
        # Pair each source with every source listed after it.
        for origin in sources[i + 1 :]:
            df = get_malid_mapping(medium, origin, target)
            df.to_csv(f"{medium}.{origin}.{target}.csv", index=False)