In [2]:
import json
import os
from pathlib import Path
from typing import Any

import requests
import requests_cache
from bs4 import BeautifulSoup
from loguru import logger

# Install an HTTP response cache backed by SQLite under the name
# "data/mal_cache"; cached entries expire after one day.
# NOTE(review): the cache name is a relative path, so it presumably resolves
# against the kernel's working directory — confirm that is the intended place.
CACHE_NAME = "data/mal_cache"
CACHE_TTL_SECONDS = 24 * 60 * 60  # 86400 seconds = 1 day
requests_cache.install_cache(
    CACHE_NAME,
    backend="sqlite",
    expire_after=CACHE_TTL_SECONDS,
)

In [3]:
# Project root is taken to be the parent of the current working directory.
# NOTE(review): presumably the notebook is launched from a sub-folder of the
# project (e.g. notebooks/) — confirm, otherwise data/ lands one level too high.
BASE_DIR = Path.cwd().resolve().parent

# Output folders, all under <root>/data.
RAW_DIR = BASE_DIR.joinpath("data", "raw")
META_DIR = BASE_DIR.joinpath("data", "metadata")
SUMMARY_DIR = BASE_DIR.joinpath("data", "summaries")

# Create the folders up front so later writes never fail on a missing parent.
for _directory in (META_DIR, SUMMARY_DIR, RAW_DIR):
    _directory.mkdir(parents=True, exist_ok=True)


# Base URL of the Jikan v4 REST API used for all metadata requests.
JIKAN_BASE = "https://api.jikan.moe/v4"

In [15]:
def save_data(url: str, data: dict[str, Any]) -> None:
    # url example: DIR / f"{process_query}_{page}.json"
    if not data:
        return
    with open(url, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)  # Handle Japanese


def fetch_metadata_from_myanimelist(query: str) -> list[dict[str, Any]]:
    """Search Jikan's ``/anime`` endpoint and return the matching entries.

    The complete JSON response is also saved to
    ``RAW_DIR / f"{process_query}_{page}.json"``, where ``process_query`` is
    the query with spaces replaced by ``-`` and slashes by ``_``. Only the
    first page (``limit`` 20) is requested.

    Args:
        query: Free-text title to search for.

    Returns:
        The list stored under the response's ``"data"`` key.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.RequestException: On a network failure or timeout.
        KeyError: If the response has no ``"data"`` key.
    """
    logger.info(f"[+] Searching MAL: {query}")
    resp = requests.get(
        f"{JIKAN_BASE}/anime",
        params={"q": query, "limit": 20},
        timeout=10,  # bound the wait so a stalled connection cannot hang the kernel
    )
    resp.raise_for_status()
    result = resp.json()

    # TODO: Check for pagination (result["pagination"]); only page 1 is fetched.
    page = 1
    # Make the query safe to use as a file name component.
    process_query = query.replace(" ", "-").replace("/", "_")
    save_data(url=RAW_DIR / f"{process_query}_{page}.json", data=result)
    return result["data"]


def filter_anime_metadata(animes_data: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Keep only the entries whose ``"type"`` is an accepted anime format.

    Accepted formats are ``tv``, ``movie``, ``ova``, ``special`` and
    ``tv_special``. Matching ignores case and treats spaces like underscores,
    so a label written as ``"TV Special"`` matches ``tv_special``. Entries
    whose type is missing, ``None`` or not a string are dropped rather than
    raising.

    Args:
        animes_data: Anime entries, each a dict that may carry a ``"type"`` key.

    Returns:
        A new list with the accepted entries, in their original order.
    """
    allowed = {"tv", "movie", "ova", "special", "tv_special"}

    def _is_allowed(entry: dict[str, Any]) -> bool:
        label = entry.get("type")
        if not isinstance(label, str):
            # Missing or null type: cannot be classified, so leave it out.
            return False
        return label.strip().lower().replace(" ", "_") in allowed

    return [entry for entry in animes_data if _is_allowed(entry)]


def _extract_synopsis_from_mal(html: str) -> str | None:
    """Return the synopsis text found in a page's HTML, or ``None``.

    Locates the first ``<h2>`` whose string contains "Synopsis", removes that
    heading from the tree, and returns the text of the heading's nearest
    enclosing ``<div>``, with text fragments joined by single spaces.

    Args:
        html: Raw HTML of the page.

    Returns:
        The synopsis text, or ``None`` when the heading or its parent ``<div>``
        is missing, the remaining text is empty, or parsing raises.
    """

    def _mentions_synopsis(text):
        return text and "Synopsis" in text

    document = BeautifulSoup(html, "html.parser")

    try:
        heading = document.find("h2", string=_mentions_synopsis)
        if not heading:
            logger.warning("Synopsis header not found")
            return None

        container = heading.find_parent("div")
        if not container:
            logger.warning("No parent div found for synopsis header")
            return None

        # Detach the heading so its own text is not part of the result.
        heading.extract()
        text = container.get_text(separator=" ", strip=True)
        return text if text else None

    except Exception:
        # Best effort: log the traceback and report "no synopsis".
        logger.exception("Error parsing synopsis HTML")
        return None


def fetch_episode_synopsis(episode_url: str) -> str | None:
    """Fetches the synopsis of a specific anime episode by scraping HTML.

    Args:
        anime_url: Base URL of the anime page.

    Returns:
        The cleaned synopsis string if found, otherwise None.
    """
    if not episode_url:
        return None
    try:
        resp = requests.get(episode_url, timeout=10)
    except requests.RequestException as e:
        logger.error(
            "Network error fetching episode {exc}", exc=str(e)
        )
        return None
    if resp.status_code != 200:
        logger.warning(
            f"Failed to fetch episode {episode_url} â€” Status {resp.status_code}",
        )
        return None

    synopsis = _extract_synopsis_from_mal(resp.text)
    if not synopsis:
        logger.info(f"No synopsis found for episode at {episode_url}")
        return None
    return synopsis



def fetch_episodes(mal_id: int) -> list[dict[str, Any]]:
    """Collect all episodes of an anime from Jikan, following pagination.

    Pages of ``{JIKAN_BASE}/anime/{mal_id}/episodes`` are requested starting at
    page 1 until a page has no ``"data"`` or reports no next page. Every
    episode dict gets an extra ``"synopsis"`` key holding the result of
    ``fetch_episode_synopsis`` for its ``"url"`` (``None`` when the episode has
    no URL or no synopsis could be obtained).

    Args:
        mal_id: Anime identifier used in the endpoint path.

    Returns:
        The episode dicts from all pages, in the order received.

    Raises:
        requests.HTTPError: If any page answers with an error status.
        requests.RequestException: On a network failure or timeout.
    """
    episodes: list[dict[str, Any]] = []
    page = 1
    while True:
        logger.info(f"[+] Searching MAL Episodes: {mal_id:6} - Page {page:2}")
        resp = requests.get(
            f"{JIKAN_BASE}/anime/{mal_id}/episodes",
            params={"page": page},
            timeout=10,  # bound the wait so a stalled connection cannot hang the kernel
        )
        resp.raise_for_status()
        payload = resp.json()
        batch = payload.get("data")
        if not batch:
            break
        for ep in batch:
            # .get(): an entry without a URL yields synopsis=None instead of a KeyError.
            ep["synopsis"] = fetch_episode_synopsis(ep.get("url"))
        episodes.extend(batch)
        # "or {}" also covers an explicit null pagination object.
        if not (payload.get("pagination") or {}).get("has_next_page"):
            break
        page += 1
    return episodes

In [16]:
# Search MAL for the show, then narrow the hits with filter_anime_metadata.
# The unfiltered result keeps its own name so the filter step can be re-run
# without repeating the search.
animes_raw = fetch_metadata_from_myanimelist("Kaguya Sama Love is War!")
animes_data = filter_anime_metadata(animes_raw)

[32m2025-07-08 14:27:34.259[0m | [1mINFO    [0m | [36m__main__[0m:[36mfetch_metadata_from_myanimelist[0m:[36m10[0m - [1m[+] Searching MAL: Kaguya Sama Love is War![0m


In [17]:
# For each anime kept by the filter, fetch its episode list and store the
# search entry together with the episodes in one JSON file per title.
for anime in animes_data:
    mal_id = anime["mal_id"]
    # NOTE(review): anime_url is not used in the visible cells — confirm no
    # later cell relies on it before removing.
    anime_url = anime["url"]
    episodes = fetch_episodes(mal_id)
    data = {"summary": anime, "episodes": episodes}
    # One file per anime: META_DIR / "<mal_id>.json".
    save_data(url=META_DIR / f"{mal_id}.json", data=data)

[32m2025-07-08 14:27:34.914[0m | [1mINFO    [0m | [36m__main__[0m:[36mfetch_episodes[0m:[36m93[0m - [1m[+] Searching MAL Episodes:  37999 - Page  1[0m
[32m2025-07-08 14:27:35.233[0m | [1mINFO    [0m | [36m__main__[0m:[36mfetch_episodes[0m:[36m93[0m - [1m[+] Searching MAL Episodes:  40591 - Page  1[0m
[32m2025-07-08 14:27:35.524[0m | [1mINFO    [0m | [36m__main__[0m:[36mfetch_episodes[0m:[36m93[0m - [1m[+] Searching MAL Episodes:  43608 - Page  1[0m
[32m2025-07-08 14:27:35.773[0m | [1mINFO    [0m | [36m__main__[0m:[36mfetch_episodes[0m:[36m93[0m - [1m[+] Searching MAL Episodes:  52198 - Page  1[0m
[32m2025-07-08 14:27:35.775[0m | [1mINFO    [0m | [36m__main__[0m:[36mfetch_episodes[0m:[36m93[0m - [1m[+] Searching MAL Episodes:  43609 - Page  1[0m
[32m2025-07-08 14:27:35.777[0m | [1mINFO    [0m | [36m__main__[0m:[36mfetch_episodes[0m:[36m93[0m - [1m[+] Searching MAL Episodes:  23229 - Page  1[0m
[32m2025-07-08 14:27: