# Match Items Using Manami's Offline Database

In [None]:
import glob
import json
import logging
import os
import time
from functools import cache

import pandas as pd
from curl_cffi import requests
from tqdm import tqdm

In [None]:
@cache
def get_offline_database(medium):
    """Download and return the manami-project anime offline database.

    Imports mappings from
    https://github.com/manami-project/anime-offline-database.
    A failed download is retried with exponential backoff (1s, 2s, 4s, ...)
    until the next delay would exceed ``max_timeout`` seconds.

    Args:
        medium: Must be "anime"; no other medium is supported here.

    Returns:
        The parsed JSON document, or an empty dict if every attempt failed.
        Because of ``functools.cache`` the result (including the empty-dict
        failure value) is memoized for the lifetime of the process.

    Raises:
        ValueError: If ``medium`` is not "anime".
    """
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if medium != "anime":
        raise ValueError(f"unsupported medium: {medium!r}")
    url = "https://github.com/manami-project/anime-offline-database/raw/master/anime-offline-database.json"
    max_timeout = 300
    timeout = 1
    while True:
        try:
            r = requests.get(url)
            if not r.ok:
                raise ValueError(f"unexpected HTTP status {r.status_code}")
            return r.json()
        except Exception as e:
            # `Exception` (not a bare except) so Ctrl-C can still abort the
            # retry loop.
            if timeout > max_timeout:
                # Give up before sleeping again: a wait that is not followed
                # by another attempt is wasted time.
                logging.error("could not download mappings")
                return {}
            logging.warning(
                "download of mappings failed (%s); retrying in %ss", e, timeout
            )
            time.sleep(timeout)
            timeout *= 2

In [None]:
@cache
def get_valid_ids(medium, source):
    """Return the set of ``uid`` values listed for ``source`` and ``medium``.

    Reads ``../../../data/media/sources/{source}.{medium}.csv`` (relative to
    the current working directory). Every column is loaded as a string and
    no value is converted to NaN, so ids such as "NA" survive unchanged.
    Results are memoized per ``(medium, source)`` pair.
    """
    csv_path = f"../../../data/media/sources/{source}.{medium}.csv"
    table = pd.read_csv(csv_path, dtype=str, keep_default_na=False)
    uids = table.uid
    return set(uids)

In [None]:
def get_database_mapping(medium, source1, source2):
    """Build a ``source1`` -> ``source2`` id mapping from the offline database.

    For every database entry, the id of the entry on each source is taken
    from the last path segment of the first URL in the entry's "sources"
    list that contains that source's domain. A pair is kept only when both
    ids are present in the corresponding ``get_valid_ids`` sets. If several
    entries share the same ``source1`` id, the last one wins.

    Args:
        medium: Media type; only "anime" yields any rows.
        source1: Name of the source used for the first column (e.g. "mal").
        source2: Name of the source used for the second column.

    Returns:
        A DataFrame with two columns named ``source1`` and ``source2``;
        empty when ``medium`` is not "anime".

    Raises:
        KeyError: If the database payload has no "data" key (for example
            when an empty dict is returned for a failed download).
    """
    # Substring that identifies each supported source's URLs.
    domains = {
        "mal": "myanimelist.net",
        "anilist": "anilist.co",
        "animeplanet": "anime-planet.com",
        "kitsu": "kitsu.app",
    }

    def get_key(urls, source):
        """Return the id from the first URL in ``urls`` matching ``source``."""
        domain = domains.get(source)
        if domain is None:
            # Unknown source: nothing can match.
            return None
        # Iterate the argument itself rather than reaching into the caller's
        # loop variable through the closure.
        for url in urls:
            if domain in url:
                return url.split("/")[-1]
        return None

    mapping = {}
    if medium == "anime":
        db = get_offline_database(medium)
        valid_keys1 = get_valid_ids(medium, source1)
        valid_keys2 = get_valid_ids(medium, source2)
        for entry in db["data"]:
            k1 = get_key(entry["sources"], source1)
            k2 = get_key(entry["sources"], source2)
            # A missing id is None, which is never in the valid-id sets.
            if k1 in valid_keys1 and k2 in valid_keys2:
                mapping[k1] = k2
    return pd.DataFrame(
        {source1: list(mapping.keys()), source2: list(mapping.values())}
    )

In [None]:
# Write one CSV per (medium, unordered source pair) into the manami output
# directory. Files are named "{medium}.{later source}.{earlier source}.csv",
# where "earlier"/"later" refer to positions in `sources`.
outdir = "../../../data/media/manami"
os.makedirs(outdir, exist_ok=True)
# NOTE: the working directory is changed here; relative paths used by the
# functions called below are resolved against `outdir` from this point on.
os.chdir(outdir)
sources = ["mal", "anilist", "kitsu", "animeplanet"]
for medium in ["manga", "anime"]:
    for position, earlier in enumerate(sources):
        # Pair each source with every source listed after it.
        for later in sources[position + 1 :]:
            df = get_database_mapping(medium, later, earlier)
            df.to_csv(f"{medium}.{later}.{earlier}.csv", index=False)