# Build the Database

*In this notebook, we build up the Color No Kami's database.*

## Fetching Data

We rely on the [MangaDex API](https://api.mangadex.org/docs/) to source high-quality manga scans, forming the foundation of our dataset. The following code acts as our toolbox, allowing us to fetch, build and refine the database that powers Color No Kami’s U-Net model.

In [1]:
import logging
from typing import List, Tuple, Dict, Callable, Iterable, Any
import time
import httpx
import json
import os
import jmespath
from urllib.parse import urlencode
from PIL.Image import Image
from pymongo import MongoClient

logging.basicConfig(
    format="%(asctime)s-%(levelname)s: %(message)s",
    filename=f"../data/log/{time.time()}.log",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)


class LogStore(object):

    def __init__(self, path: str) -> None:
        self.client = MongoClient("localhost", 27017)
        self.db = self.client.ColorNoKami
        self.logs = self.db.log
        self.cache = {}
        self.timestamp = time.time()

    def is_logged(self, url: str) -> bool:
        return self.logs.find_one({"_id": url}) != None

    def register_log(self, url: str, data: Dict) -> None:
        self.cache[url] = data
        if (time.time() - self.timestamp) > 3:
            self.logs.insert_many([{"_id": k, **v} for k, v in self.cache.items()])
            self.cache = {}
            self.timestamp = time.time()

    def get_log(self, url) -> Dict:
        return self.logs.find_one({"_id": url})


class ApiClient(object):

    def __init__(self, cache: LogStore, request_per_minute: int = 30):
        self.cache = cache
        self.number_request = 0
        self.request_per_minute = request_per_minute
        self.timestamp = time.time()

    def get(self, url: str, rate: float = 60.0):
        if not self.cache.is_logged(url):

            self.number_request += 1
            if self.number_request > self.request_per_minute:
                sleep_duration = max(120.0 - (time.time() - self.timestamp), 0.0)
                logger.info(
                    f"limit request per minute reach wait for '{sleep_duration}'s"
                )
                time.sleep(sleep_duration)
                self.number_request = 1
                self.timestamp = time.time()

            time.sleep(0.5)
            response = httpx.get(url)
            if not response.status_code == 200:
                raise Exception(
                    f"Failed to contact api [{response.status_code}] {response.text},"
                )

            data_json = response.json()
            self.cache.register_log(url, data_json)
            return data_json
        else:
            return self.cache.get_log(url)


class MangaDataMng(object):

    def __init__(self, storage_path: str) -> None:
        self.data = []
        self.storage_path = storage_path
        self._load_cache()

    def _load_cache(self) -> None:
        file_path = f"{self.storage_path}/data.json"
        if not os.path.exists(file_path):
            return
        with open(file_path, "r") as file:
            content = file.read()
            if content == "":
                return
            decoder = json.JSONDecoder()
            self.data = decoder.decode(content)

    def has(self, data):
        id = data["id"]
        return jmespath.search(f"[?id == `{id}`] | [0] != null", self.data)

    def insert(self, data):
        self.data.append(data)
        with open(f"{self.storage_path}/data.json", "w") as file:
            encoder = json.JSONEncoder()
            file.write(encoder.encode(self.data))


class ApiData(object):

    def __init__(self, client: ApiClient, api_url: str, params: Dict[str, str] = {}):
        self.api_url = api_url
        self.params = params
        self.iterator = ApiDataIterator(client=client, api_url=api_url, params=params)

    def __iter__(self):
        return self.iterator


class ApiDataIterator(object):

    def __init__(self, client: ApiClient, api_url: str, params: Dict[str, str] = {}):
        self.client = client
        self.offset = 0
        self.total = 0
        self.api_url = api_url
        self.params = params
        self.data = []
        self.index = 0
        self.finish_pulling = False
        self._reload_cache()

    def _reload_cache(self):
        if len(self.data) <= (self.index + 1):
            self.params["offset"] = self.offset
            url = f"{self.api_url}?{urlencode(self.params)}"
            data_json = self.client.get(url)
            self.offset += len(data_json["data"])
            self.total = data_json["total"]
            self.data = data_json["data"]
            self.index = 0
            if self.offset >= self.total:
                self.finish_pulling = True

    def value(self) -> Dict[str, Any]:
        return self.data[self.index]

    def __next__(self) -> None:
        self._reload_cache()
        self.index += 1
        if self.finish_pulling and self.index >= len(self.data):
            raise StopIteration()
        return self.data[self.index]


class ImagesDatabank(object):

    def __init__(self, storage_path):
        self.storage_path = storage_path

    def register(self, host: str, hash: str, name: str) -> Image:
        path = f"{self.storage_path}/{hash}_{name}"
        if os.path.exists(path):
            return ImagePIL.open(path)
        else:
            time.sleep(2)
            request = httpx.get(f"{host}/data/{hash}/{name}")
            image = ImagePIL.open(BytesIO(request.content))
            image.save(path)
            return image


base_url = "https://api.mangadex.org"
cache = LogStore(path="../data")
api = ApiClient(cache)
mng = MangaDataMng("../data")
img_databank = ImagesDatabank("../data/images_candidates")
logger.info("Start logging")

client = MongoClient("localhost", port=27017)
db = client.ColorNoKami
logs = db.log


First we iterates over the MangaDex API to selectively cache full-color manga locally.

In [None]:
[
    _
    for _ in ApiData(
        api,
        f"{base_url}/manga",
        params={"title": "Official color", "limit": "100", "order[createdAt]": "asc"},
    )
]
print(f"Number of full color books '{logs.count_documents({})}'")

Request all full-color books from the log database, filter to include only those with an attached monochrome version, and remove duplicates to ensure a unique list. Store the final result in a new collection named "book."

In [None]:

logs.aggregate(
    [
        {"$unwind": {"path": "$data"}},
        {"$unwind": {"path": "$data.relationships"}},
        {
            "$match": {
                "data.relationships.type": "manga",
                "data.relationships.related": "monochrome",
            }
        },
        {
            "$project": {
                "_id": "$data.id",
                "monochrome_id": "$data.relationships.id",
                "title": "$data.attributes.title.en",
            }
        },
        {
            "$group": {
                "_id": "$_id",
                "monochrome_id": {"$first": "$monochrome_id"},
                "title": {"$first": "$title"},
            }
        },
        {"$out": "books"},
    ]
)

print(f"There are '{db.books.count_documents({})}' books in the database have full color and monochromic version")

We apply the same process to monochrome versions of manga that correspond to previously fetched full-color manga, ensuring we have both versions locally cached for comparison and further processing. Then, we fetched their respective chapters.

In [4]:
books = list(db.books.find())

for book in books:
    official_color_id = book["_id"]
    monochrome_id = book["monochrome_id"]
    [
        _
        for _ in ApiData(
            api,
            f"{base_url}/manga/{monochrome_id}/feed",
            params={"order[createdAt]": "asc", "limit": "500"},
        )
    ]
    [
        _
        for _ in ApiData(
            api,
            f"{base_url}/manga/{official_color_id}/feed",
            params={"order[createdAt]": "asc", "limit": "500"},
        )
    ]

Retrieve chapters from the logs, clean the data to ensure consistency, and remove any duplicates to obtain a unique set of chapters.

In [None]:
db.log.aggregate(
    [
        {
            "$match": {
                "_id": {"$regex": "https\:\/\/api.mangadex.org\/manga\/(.*)\/feed"},
                "data": {"$elemMatch": {"$ne": "null"}},
            },
        },
        {"$unwind": "$data"},
        {
            "$addFields": {
                "returnMatch": {
                    "$regexFind": {
                        "input": "$_id",
                        "regex": "https\:\/\/api.mangadex.org\/manga\/(.*)\/feed",
                    }
                }
            }
        },
        {"$unwind": "$returnMatch.captures"},
        {
            "$project": {
                "_id": "$data.id",
                "manga_id": "$returnMatch.captures",
                "volume": "$data.attributes.volume",
                "chapter": "$data.attributes.chapter",
                "title": "$data.attributes.title",
                "translatedLanguage": "$data.attributes.translatedLanguage",
            }
        },
        {
            "$group": {
                "_id": "$_id",
                "manga_id": {"$first": "$manga_id"},
                "volume": {"$first": "$volume"},
                "chapter": {"$first": "$chapter"},
                "translatedLanguage": {"$first": "$translatedLanguage"},
            }
        },
        {"$out": "chapters"},
    ]
)

print(f"There are '{db.chapters.count_documents({})}' in the database")

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import pprint

df = pd.DataFrame(data = list(db.chapters.find()))
lang = df["translatedLanguage"]
lang_cnt = lang.value_counts()
strong_lang_cnt = lang_cnt[lang_cnt > lang_cnt.mean()]
wanted_languages = strong_lang_cnt[:4].index.to_list()
wanted_languages

Join each fullcolor book with its corresponding chapters, excluding any unused chapters

In [None]:
db.books.aggregate(
    [
        {
            "$lookup": {
                "from": "chapters",
                "localField": "_id",
                "foreignField": "manga_id",
                "as": "chroma_data",
            }
        },
        {
            "$lookup": {
                "from": "chapters",
                "localField": "monochrome_id",
                "foreignField": "manga_id",
                "as": "mono_data",
            }
        },
        {
            "$match": {
                "chroma_data": {"$elemMatch": {"$ne": "null"}},
                "mono_data": {"$elemMatch": {"$ne": "null"}},
            }
        },
        {"$unwind": "$chroma_data"},
        {"$unwind": "$mono_data"},
        {
            "$match": {
                "$expr": {
                    "$and": [
                        {
                            "$eq": [
                                "$chroma_data.translatedLanguage",
                                "$mono_data.translatedLanguage",
                            ]
                        },
                        {
                            "$eq": [
                                "$chroma_data.volume",
                                "$mono_data.volume",
                            ]
                        },
                        {
                            "$eq": [
                                "$chroma_data.chapter",
                                "$mono_data.chapter",
                            ]
                        }
                    ]
                },
            }
        },
        {
            "$project": {
                "_id": {
                    "$concat": [
                        "$title",
                        " - ",
                        "$chroma_data.volume",
                        " - ",
                        "$chroma_data.chapter",
                        " - ",
                        "$chroma_data.translatedLanguage",
                    ]
                },
                "chapter": "$chroma_data.chapter",
                "volume": "$chroma_data.volume",
                "title": "$title",
                "monochrome.id": "$mono_data._id",
                "monochrome.manga_id": "$mono_data.manga_id",
                "monochrome.translatedLanguage": "$mono_data.translatedLanguage",
                "colorfull.id": "$chroma_data._id",
                "colorfull.manga_id": "$chroma_data.manga_id",
                "colorfull.translatedLanguage": "$chroma_data.translatedLanguage",
            }
        },
        {
            "$group": {
                "_id": "$_id",
                "volume": {"$first": "$volume"},
                "title": {"$first": "$title"},
                "chapter": {"$first": "$chapter"},
                "monochrome": {"$first": "$monochrome"},
                "colorfull": {"$first": "$colorfull"},
            }
        },
        {"$sort": {"_id": 1}},
        {"$out": "paired_chapters"},
    ],
)


print(f"There are '{db.paired_chapters.count_documents({})}' chapters")

In [2]:
for paired_chapters in list(db.paired_chapters.find({})):
    monochrome_id = paired_chapters["monochrome"]["id"]
    colorfull_id = paired_chapters["colorfull"]["id"]
    api.get(f"{base_url}/at-home/server/{monochrome_id}")
    api.get(f"{base_url}/at-home/server/{colorfull_id}")

Exception: Failed to contact api [404] {"result":"error","errors":[{"id":"770a6444-c695-570d-bfd2-a0eec2e95ce9","status":404,"title":"not_found_http_exception","detail":"Chapter with ID 7b317532-7408-49fe-9f19-3b197afae49b not found.","context":null}]},