# Animeplanet Mappings
* Builds a mapping from Anime-Planet titles to MyAnimeList (MAL) ids

In [None]:
import glob
import html
import json
import logging
import os
from functools import cache

import pandas as pd
import rapidfuzz
from tqdm import tqdm

In [None]:
# Work from the raw-data directory; the relative paths used in the cells
# below (e.g. "../processed_data", "../animeplanet") are resolved from here.
os.chdir("../../data/raw_data")
# Must run after the chdir above because the path is relative to the new
# working directory. exist_ok=True makes re-running this cell harmless.
os.makedirs("../processed_data", exist_ok=True)

In [None]:
def get_titles(input_fn):
    """Count how often each value of the ``title`` column occurs in a file.

    The first line of ``input_fn`` is treated as the header. Every line is
    stripped and split on each comma; quoting is not interpreted, so a
    title that itself contains a comma is truncated at that comma.

    Returns a dict mapping title -> number of rows carrying that title.
    Raises ValueError on the first data row if the header has no ``title``
    column.
    """
    counts = {}
    columns = None  # header fields; filled in from the first line
    with open(input_fn, "r") as in_file:
        for raw_line in tqdm(in_file):
            parts = raw_line.strip().split(",")
            if columns is None:
                columns = parts
                continue
            name = parts[columns.index("title")]
            counts[name] = counts.get(name, 0) + 1
    return counts

In [None]:
# TODO better fuzzy matching
def fuzzy_match(keys, database, matches, sanitize, cutoff):
    """Fuzzy-match ``keys`` against ``database`` and record hits in ``matches``.

    Args:
        keys: iterable of titles to resolve.
        database: dict of candidate title -> id.
        matches: dict of already resolved title -> id. It is updated in
            place, and keys already present in it are skipped.
        sanitize: callable applied to both the key and every candidate
            title before they are compared.
        cutoff: value passed to rapidfuzz as ``score_cutoff``; rapidfuzz
            returns no match when the best score is below it.

    Returns:
        The number of entries newly added to ``matches``.
    """
    # Candidates and ids are kept as parallel lists so that the index
    # reported by rapidfuzz can be used to look up the id.
    candidates = [sanitize(name) for name in database.keys()]
    ids = list(database.values())
    added = 0
    for key in tqdm(keys):
        if key not in matches:
            best = rapidfuzz.process.extractOne(
                sanitize(key),
                candidates,
                scorer=rapidfuzz.fuzz.ratio,
                score_cutoff=cutoff,
            )
            if best is not None:
                # best[2] is the position of the winning candidate.
                matches[key] = ids[best[2]]
                added += 1
    return added

In [None]:
def get_keys(medium):
    """Aggregate title counts over every user list file for ``medium``.

    Files matching
    ``../animeplanet/user_media_facts/user_{medium}_list.*.csv`` are
    processed in sorted order with ``get_titles`` and their per-title
    counts are summed.

    Returns a dict mapping title -> total count (empty if no file matches).
    """
    pattern = os.path.join(
        "../animeplanet", f"user_media_facts/user_{medium}_list.*.csv"
    )
    totals = {}
    for path in sorted(glob.glob(pattern)):
        for title, count in get_titles(path).items():
            totals[title] = totals.get(title, 0) + count
    return totals

In [None]:
def html_sanitize(x):
    """Normalise a title for comparison: decode HTML entities, then lowercase.

    Entities are decoded *before* lowercasing because entity names are
    case-sensitive: lowercasing first would turn e.g. ``&Dagger;`` into
    ``&dagger;`` (a different character) or into a name that is not an
    entity at all and is therefore left undecoded.
    """
    return html.unescape(x).lower()

# Database Mappings

In [None]:
@cache
def get_database_mapping(medium):
    """Return a dict mapping Anime-Planet URL slugs to MyAnimeList ids.

    Only ``medium == "anime"`` is supported; any other medium yields an
    empty dict. The data comes from a freshly downloaded copy of
    anime-offline-database.json. For every slug containing dashes, a
    variant with the dashes replaced by spaces is added as well.

    The result is memoised per ``medium`` by ``functools.cache``, so the
    download happens at most once per medium and every caller receives the
    same dict object.

    NOTE(review): this function uses an IPython shell escape (``! wget``),
    so it only runs inside a notebook/IPython session.
    """
    if medium != "anime":
        return {}
    # Imports anime mappings from https://github.com/manami-project/anime-offline-database
    html_to_mal = {}
    try:
        # Remove any previously downloaded copy before fetching again
        # (presumably so wget writes to this exact filename - TODO confirm).
        if os.path.exists("anime-offline-database.json"):
            os.remove("anime-offline-database.json")
        json_fn = "https://github.com/manami-project/anime-offline-database/raw/master/anime-offline-database.json"
        # IPython shell escape; `$json_fn` refers to the local variable above,
        # so that name must not be changed without updating this line.
        ! wget $json_fn
        with open("anime-offline-database.json", "r") as f:
            db = json.load(f)
        for d in db["data"]:
            mal = None
            # NOTE(review): this local shadows the imported `html` module for
            # the rest of this function; the module is not used in here.
            html = None
            # Each source is a URL string; the last path segment is taken as
            # the MAL id / Anime-Planet slug respectively.
            for x in d["sources"]:
                if "myanimelist.net" in x:
                    mal = x.split("/")[-1]
                elif "anime-planet.com" in x:
                    html = x.split("/")[-1]
            # Only entries that list both sites contribute a mapping.
            if mal is not None and html is not None:
                html_to_mal[html] = int(mal)
    except Exception as e:
        # Best effort: any failure (download, parsing, int conversion) is
        # printed and the function carries on with whatever entries were
        # collected before the error.
        # NOTE(review): a single bad entry therefore stops processing of all
        # remaining entries.
        print(str(e))
    # the animeplanet html replaces spaces with dashes, so we don't
    # know if a dash is a true dash or a multiword title with spaces
    multiword = {}
    for x, y in html_to_mal.items():
        k = x.replace("-", " ")
        # Add the space-separated variant unless it would clash with a real slug.
        if k != x and k not in html_to_mal:
            multiword[k] = y
    # In-place dict merge; collected separately above so the dict is not
    # modified while it is being iterated.
    html_to_mal |= multiword
    return html_to_mal

In [None]:
@cache
def get_media_mapping(medium):
    """Map every known title (and English title) of ``medium`` to its id.

    Reads ``{medium}.csv`` from the current working directory and uses its
    ``title``, ``english_title``, ``type`` and ``{medium}_id`` columns.
    For the types listed below the title is registered once per suffix
    (e.g. a "Light Novel" is registered as "<title> (Novel)" and
    "<title> (Light Novel)"); every other type is registered without a
    suffix. Later rows overwrite earlier rows that share a key.

    The result is memoised per ``medium`` by ``functools.cache``.
    """
    df = pd.read_csv(f"{medium}.csv")
    id_column = f"{medium}_id"
    english_present = df.english_title.notna()
    suffixes_by_type = {
        "Light Novel": [" (Novel)", " (Light Novel)"],
        "Novel": [" (Novel)"],
        "One-shot": [" (Pilot)"],
    }
    title_to_uid = {}
    for row in df.index:
        for suffix in suffixes_by_type.get(df.at[row, "type"], [""]):
            uid = df.at[row, id_column]
            title_to_uid[df.at[row, "title"] + suffix] = uid
            if english_present[row]:
                title_to_uid[df.at[row, "english_title"] + suffix] = uid
    return title_to_uid

In [None]:
@cache
def get_hardcoded_mapping(medium):
    """Load the hand-maintained Anime-Planet title -> id overrides.

    Reads ``../../environment/animeplanet/{medium}_to_mal.csv`` (columns
    ``animeplanet`` and ``mal``) and resolves each ``mal`` title through
    ``get_media_mapping``. Rows whose ``mal`` title is unknown are skipped
    with a warning. Returns an empty dict when the file does not exist.

    NOTE(review): a later cell in this file defines
    ``get_hardcoded_mapping`` again; once that cell has run, the later
    definition replaces this one.
    """
    fn = f"../../environment/animeplanet/{medium}_to_mal.csv"
    if not os.path.exists(fn):
        return {}
    overrides = pd.read_csv(fn)
    known_titles = get_media_mapping(medium)
    resolved = {}
    for row in range(len(overrides)):
        mal_title = overrides.mal[row]
        if mal_title not in known_titles:
            logging.warning(f"Could not find {overrides.animeplanet[row]} -> {overrides.mal[row]}")
            continue
        resolved[overrides.animeplanet[row]] = known_titles[mal_title]
    return resolved

# Save results

In [None]:
def write(medium, mapping):
    """Save ``mapping`` to ``../processed_data/animeplanet_{medium}_to_uid.csv``.

    The file gets a ``title,{medium}_id`` header followed by one
    ``key,value`` line per mapping entry, in the mapping's iteration order.
    Values are written as-is, without any CSV quoting or escaping.
    """
    out_path = f"../processed_data/animeplanet_{medium}_to_uid.csv"
    with open(out_path, "w") as f:
        f.write(f"title,{medium}_id\n")
        f.writelines(f"{title},{uid}\n" for title, uid in mapping.items())

In [None]:
def write_unmapped(medium, keys, mapping):
    """Write the titles that could not be mapped, most frequent first.

    Args:
        medium: medium name used to build the input/output file names.
        keys: dict of title -> occurrence count (all titles seen).
        mapping: dict whose keys are the titles that were mapped.

    Titles present in ``mapping`` are dropped. If
    ``../../environment/animeplanet/{medium}_unmapped.csv`` exists, titles
    listed in its ``animeplanet`` column are dropped as well. The remainder
    is sorted by count (descending) and saved with columns ``title,count``
    to ``../processed_data/animeplanet_{medium}_unmapped.csv``.
    """
    # Build the frame with explicit columns so that an empty ``keys`` dict
    # yields an empty two-column frame instead of raising a length-mismatch
    # error when the column names are assigned.
    df = pd.DataFrame({"title": list(keys.keys()), "count": list(keys.values())})
    df = df[~df["title"].isin(mapping.keys())]
    fn = f"../../environment/animeplanet/{medium}_unmapped.csv"
    if os.path.exists(fn):
        # Titles already recorded in the existing unmapped file are skipped.
        seen = pd.read_csv(fn)
        df = df[~df["title"].isin(set(seen.animeplanet))]
    df = df.sort_values(by="count", ascending=False)
    df.to_csv(f"../processed_data/animeplanet_{medium}_unmapped.csv", index=False)

In [None]:
def generate_mapping(medium):
    """Build and save the Anime-Planet title -> id mapping for ``medium``.

    Titles are matched in two passes, first with a score cutoff of 100 and
    then with 85. Within each pass the sources are tried in the order
    media mapping, database mapping, hardcoded mapping. ``fuzzy_match``
    skips titles that are already matched, so the first source to match a
    title wins. The mapping and the remaining unmapped titles are then
    written out.
    """
    keys = get_keys(medium)
    media_to_uid = {}
    for cutoff in (100, 85):
        # All three sources are loaded before any matching in this pass.
        sources = (
            get_media_mapping(medium),
            get_database_mapping(medium),
            get_hardcoded_mapping(medium),
        )
        for source in sources:
            fuzzy_match(list(keys), source, media_to_uid, html_sanitize, cutoff)
    write(medium, media_to_uid)
    write_unmapped(medium, keys, media_to_uid)

In [None]:
@cache
def get_hardcoded_mapping(medium):
    """Load the hand-maintained Anime-Planet title -> id overrides.

    Reads ``../../environment/animeplanet/{medium}_to_mal.csv`` (columns
    ``animeplanet`` and ``mal``), treating backslash as the CSV escape
    character, and resolves each ``mal`` title through
    ``get_media_mapping``. Rows whose ``mal`` title is unknown are skipped
    with a warning. Returns an empty dict when the file does not exist.

    The result is memoised per ``medium``, like the other mapping loaders,
    so the file is read and the warnings are logged only once even though
    ``generate_mapping`` requests this mapping once per cutoff pass.
    """
    fn = f"../../environment/animeplanet/{medium}_to_mal.csv"
    if not os.path.exists(fn):
        return {}
    df = pd.read_csv(fn, escapechar="\\")
    title_to_uid = {}
    mal_to_uid = get_media_mapping(medium)
    for animeplanet_title, mal_title in zip(df.animeplanet, df.mal):
        if mal_title in mal_to_uid:
            title_to_uid[animeplanet_title] = mal_to_uid[mal_title]
        else:
            logging.warning(f"Could not find {animeplanet_title} -> {mal_title}")
    return title_to_uid

In [None]:
# Generate and save the title mapping for each supported medium.
for medium in ("anime", "manga"):
    generate_mapping(medium)