# Animeplanet Mappings
* Builds a mapping from Anime-Planet titles to MyAnimeList (MAL) ids

In [None]:
import glob
import html
import json
import os

import pandas as pd
import rapidfuzz
from tqdm import tqdm

In [None]:
# Work from the raw-data directory: the relative paths used by the cells below
# (e.g. "../animeplanet", "../processed_data") are resolved from here.
os.chdir("../../data/raw_data")
# Create the output directory (a sibling of raw_data) if it does not exist yet.
os.makedirs("../processed_data", exist_ok=True)

In [None]:
def get_titles(input_fn):
    """Count how many rows of a CSV file carry each title.

    The first row is treated as the header and must contain a ``title``
    column. The file is parsed with the ``csv`` module so that quoted titles
    containing commas stay intact instead of being cut at the first comma.

    Args:
        input_fn: Path of the CSV file to read.

    Returns:
        A dict mapping each title to the number of rows it appears in.
        An empty file yields an empty dict.

    Raises:
        ValueError: If the header row has no ``title`` column.
    """
    import csv  # local import so the cell does not depend on the import cell

    titles = {}
    # newline="" lets the csv module deal with line endings itself.
    with open(input_fn, "r", newline="") as in_file:
        reader = csv.reader(in_file)
        header_fields = next(reader, None)
        if header_fields is None:
            return titles
        # Look the column up once instead of once per row.
        title_idx = header_fields.index("title")
        for fields in tqdm(reader):
            if len(fields) <= title_idx:
                # Blank or truncated row: there is no title to count.
                continue
            t = fields[title_idx]
            titles[t] = titles.get(t, 0) + 1
    return titles

In [None]:
# TODO better fuzzy matching
def fuzzy_match(keys, database, matches, sanitize):
    """Fuzzy-match ``keys`` against ``database`` and record hits in ``matches``.

    Every key that is not already in ``matches`` is sanitized and compared
    with the sanitized database keys using ``rapidfuzz.fuzz.ratio``; the best
    candidate scoring at least 85 wins and its database value is stored under
    the original (unsanitized) key.

    Args:
        keys: Iterable of strings to look up.
        database: Dict whose keys are candidate strings and whose values are
            the results to store.
        matches: Dict updated in place with ``key -> database value``.
        sanitize: Callable applied to both keys and database keys before
            comparison.

    Returns:
        The number of entries added to ``matches`` by this call.
    """
    candidates = []
    targets = []
    for name, target in database.items():
        candidates.append(sanitize(name))
        targets.append(target)

    added = 0
    for key in tqdm(keys):
        if key not in matches:
            best = rapidfuzz.process.extractOne(
                sanitize(key),
                candidates,
                scorer=rapidfuzz.fuzz.ratio,
                score_cutoff=85,
            )
            if best is not None:
                # extractOne returns (choice, score, index) for list choices.
                _, _, position = best
                matches[key] = targets[position]
                added += 1
    return added

In [None]:
def get_keys(medium):
    """Aggregate title counts over all Anime-Planet list files of a medium.

    Args:
        medium: Medium name used in the file pattern
            ``../animeplanet/user_media_facts/user_{medium}_list.*.csv``.

    Returns:
        A dict mapping each title to its total count across the matching
        files, which are processed in sorted filename order.
    """
    pattern = os.path.join(
        "../animeplanet", f"user_media_facts/user_{medium}_list.*.csv"
    )
    totals = {}
    for path in sorted(glob.glob(pattern)):
        per_file = get_titles(path)
        for title, count in per_file.items():
            totals[title] = totals.get(title, 0) + count
    return totals

# Load from db

In [None]:
def get_database_mapping():
    """Build a mapping from Anime-Planet URL names to MyAnimeList ids.

    Downloads a fresh copy of ``anime-offline-database.json`` from
    https://github.com/manami-project/anime-offline-database into the current
    directory (replacing any existing copy). For every entry that lists both a
    myanimelist.net and an anime-planet.com source, the last path segment of
    the anime-planet.com URL is mapped to the last path segment of the
    myanimelist.net URL, converted to ``int``.

    The whole operation is best effort: any error is printed and whatever was
    collected up to that point (possibly nothing) is returned.

    Returns:
        A dict mapping the Anime-Planet URL segment to the MyAnimeList id.
    """
    # Local imports: plain-Python download instead of the IPython `! wget`
    # shell escape, so no external binary is needed.
    import shutil
    import urllib.request

    html_to_mal = {}
    json_name = "anime-offline-database.json"
    try:
        # Drop a stale copy first so a failed download cannot be mistaken for
        # fresh data.
        if os.path.exists(json_name):
            os.remove(json_name)
        json_fn = "https://github.com/manami-project/anime-offline-database/raw/master/anime-offline-database.json"
        with urllib.request.urlopen(json_fn) as response, open(
            json_name, "wb"
        ) as out_file:
            shutil.copyfileobj(response, out_file)
        with open(json_name, "r", encoding="utf-8") as f:
            db = json.load(f)
        for d in db["data"]:
            mal_id = None
            # Named `planet_name` rather than `html` to avoid shadowing the
            # imported `html` module.
            planet_name = None
            for x in d["sources"]:
                if "myanimelist.net" in x:
                    mal_id = x.split("/")[-1]
                elif "anime-planet.com" in x:
                    planet_name = x.split("/")[-1]
            if mal_id is not None and planet_name is not None:
                html_to_mal[planet_name] = int(mal_id)
    except Exception as e:
        # Deliberately broad: a missing or broken database must not stop the
        # notebook; callers simply get fewer (or no) mappings.
        print(str(e))
    return html_to_mal

In [None]:
def html_sanitize(x):
    """Return ``x`` lower-cased, with HTML character references decoded."""
    lowered = x.lower()
    return html.unescape(lowered)

# Load from titles

In [None]:
def get_media_mapping(medium):
    """Map every known title of a medium to its id.

    Reads ``{medium}.csv`` from the current directory. Both the ``title`` and
    the ``english_title`` of each row are mapped to the row's
    ``{medium}_id``. Rows whose ``type`` is ``light_novel`` or ``novel`` get
    the suffix ``" (novel)"`` appended to their names. Names that are missing
    in the CSV are skipped instead of raising.

    Args:
        medium: Medium name; selects both the CSV file and the id column.

    Returns:
        A dict mapping title (plus suffix) to id. When several rows share a
        name, the last row wins.
    """
    df = pd.read_csv(f"{medium}.csv")
    medium_to_suffix = {"light_novel": " (novel)", "novel": " (novel)"}
    synonym_to_uid = {}
    # Walk the columns in lockstep; this does not rely on the index labels.
    rows = zip(df["type"], df["title"], df["english_title"], df[f"{medium}_id"])
    for kind, title, english_title, uid in rows:
        suffix = medium_to_suffix.get(kind, "")
        # Title first, English title second, matching the original overwrite order.
        for name in (title, english_title):
            if pd.notna(name):
                synonym_to_uid[name + suffix] = uid
    return synonym_to_uid

# Save results

In [None]:
def write(medium, mapping):
    """Write a title -> id mapping to the processed-data directory.

    The output file is ``../processed_data/animeplanet_{medium}_to_uid.csv``
    with the header ``title,{medium}_id``. Rows go through ``csv.writer`` so
    titles containing commas, quotes or newlines are quoted and the file
    stays parseable.

    Args:
        medium: Medium name used in the file name and the id column header.
        mapping: Dict mapping title to id.
    """
    import csv  # local import so the cell does not depend on the import cell

    out_path = f"../processed_data/animeplanet_{medium}_to_uid.csv"
    # newline="" plus an explicit "\n" terminator keeps the same line endings
    # on every platform.
    with open(out_path, "w", newline="") as f:
        writer = csv.writer(f, lineterminator="\n")
        writer.writerow(["title", f"{medium}_id"])
        writer.writerows(mapping.items())

In [None]:
def write_unmapped(medium, keys, mapping):
    """Write the titles that have no mapping, most frequent first.

    The output file is ``../processed_data/animeplanet_{medium}_unmapped.csv``
    with the columns ``title`` and ``count``.

    Args:
        medium: Medium name used in the output file name.
        keys: Dict mapping title to occurrence count. May be empty, in which
            case only the header is written.
        mapping: Dict whose keys are the titles that were mapped.
    """
    # Explicit columns keep the frame two columns wide even when `keys` is empty.
    df = pd.DataFrame(list(keys.items()), columns=["title", "count"])
    is_mapped = df["title"].isin(list(mapping))
    unmapped = df[~is_mapped].sort_values(by="count", ascending=False)
    unmapped.to_csv(
        f"../processed_data/animeplanet_{medium}_unmapped.csv", index=False
    )

In [None]:
# Collect every Anime-Planet anime title together with its occurrence count.
anime_keys = get_keys("anime")
anime_to_uid = {}
# First pass: match the titles against the anime-offline-database mapping.
fuzzy_match(list(anime_keys), get_database_mapping(), anime_to_uid, html_sanitize)
# Second pass: match against the titles in anime.csv. fuzzy_match skips keys
# that are already in anime_to_uid, so first-pass results are kept.
fuzzy_match(list(anime_keys), get_media_mapping("anime"), anime_to_uid, html_sanitize)
# Save the matches, then the titles that could not be matched.
write("anime", anime_to_uid)
write_unmapped("anime", anime_keys, anime_to_uid)

In [None]:
# Collect every Anime-Planet manga title together with its occurrence count.
manga_keys = get_keys("manga")
manga_to_uid = {}
# Single pass: match against the titles in manga.csv (no offline-database
# pass is run for manga).
fuzzy_match(list(manga_keys), get_media_mapping("manga"), manga_to_uid, html_sanitize)
# Save the matches, then the titles that could not be matched.
write("manga", manga_to_uid)
write_unmapped("manga", manga_keys, manga_to_uid)

In [None]:
# TODO manually review unmapped items