# Build the Database

*In this notebook, we build the Color No Kami database.*

## 1. Fetching Data
----------------

We rely on the [MangaDex API](https://api.mangadex.org/docs/) to source high-quality manga scans, forming the foundation of our dataset. The following code acts as our toolbox, allowing us to fetch, build and refine the database that powers Color No Kami’s U-Net model.

In [14]:
""" 
 # What does my application do?

 It builds my database for a U-Net.
 - task: fetching the data
 - task: restructuring the data
 
 I want comic book pages paired as colored and monochrome versions.

# What is my domain?
 
"""

from pymongo import MongoClient

from database_crafting.log.mongo_log_repository import MongoLogRepository
from database_crafting.network_client.log_client import LogClient
from database_crafting.image_databank.databank import ImagesDatabank
from database_crafting.api_data.manga_dex_api_data_factory import MangaDexApiDataFactory


# Connect to the MongoDB server on localhost, port 27017.
db_client = MongoClient("localhost", port=27017)
# Every collection used in this notebook is read from / written to the
# "ColorNoKami" database.
db = db_client.ColorNoKami
# Log repository built on the database handle.
# NOTE(review): presumably backs the `log` collection queried later as
# `db.log` — confirm against MongoLogRepository.
log_repo = MongoLogRepository(db)
# Network client built on the log repository; later cells call
# `log_client.get(url)` with MangaDex API URLs.
log_client = LogClient(log_repo)
# Factory used by later cells to create "chapter_feed" API data objects.
api_data_factory = MangaDexApiDataFactory(log_client)
# Image databank constructed with the "./data/images_candidates" path; later
# cells call its `register(...)` and `has(...)` methods.
img_databank = ImagesDatabank("./data/images_candidates")

### 1.1 Fetching book's data
----------------------------

We iterate over the MangaDex API to selectively cache full-color manga locally.

In [15]:
from database_crafting.book.mongo_book_repository import MongoBookRepository

# Repository holding the books; fetching goes through the API data factory.
book_repo = MongoBookRepository(db)
book_repo.fetch_books(api_data_factory)

# Report how many log entries and books are stored after the fetch.
log_count = log_repo.count_log()
print(f"Number of log '{log_count}'")
book_count = book_repo.get_books_count()
print(f"Number of books '{book_count}'")

Number of log '19297'
Number of books '96'


This step requests all full-color books from the log database, keeps only those with an attached monochrome version, and removes duplicates so that every book appears only once. The final result is stored in a new collection named "books".

### 1.2 Fetch chapters' data
-----------------------

We then fetch the chapter feed of each book, in both its monochrome and its full-color version.

In [16]:
# For every stored book, fetch the chapter feed of its monochrome edition
# first, then the one of its full-color edition.
for book in book_repo.get_books():
    for edition_attr in ("monochrome_id", "fullcolor_id"):
        manga_id = getattr(book, edition_attr)
        feed = api_data_factory.create_api_data("chapter_feed", params=[manga_id])
        feed.fetch()
    

Build the "chapters" collection from the previously cached logs, making sure it contains no duplicates.

In [None]:
# Pattern matching the cached chapter-feed requests; the capture group is the
# manga id. Written as a raw string because "\:" and "\/" are invalid escape
# sequences in a regular Python string literal (a warning today, an error in a
# future Python release). The pattern value itself is unchanged.
CHAPTER_FEED_URL_REGEX = r"https\:\/\/api.mangadex.org\/manga\/(.*)\/feed"

# Build the "chapters" collection from the cached chapter-feed logs:
# one document per chapter id, tagged with the manga it belongs to.
db.log.aggregate(
    [
        # Keep only chapter-feed logs whose `data` array has a non-null entry.
        {
            "$match": {
                "_id": {"$regex": CHAPTER_FEED_URL_REGEX},
                "data": {"$elemMatch": {"$ne": None}},
            },
        },
        # One document per chapter entry of the feed.
        {"$unwind": "$data"},
        # Extract the manga id from the request URL stored in `_id`.
        {
            "$addFields": {
                "returnMatch": {
                    "$regexFind": {
                        "input": "$_id",
                        "regex": CHAPTER_FEED_URL_REGEX,
                    }
                }
            }
        },
        {"$unwind": "$returnMatch.captures"},
        {
            "$project": {
                "_id": "$data.id",
                "manga_id": "$returnMatch.captures",
                "volume": "$data.attributes.volume",
                "chapter": "$data.attributes.chapter",
                "title": "$data.attributes.title",
                "translatedLanguage": "$data.attributes.translatedLanguage",
            }
        },
        # De-duplicate on the chapter id, keeping the first occurrence.
        # Note: `title` is projected above but not carried through this group.
        {
            "$group": {
                "_id": "$_id",
                "manga_id": {"$first": "$manga_id"},
                "volume": {"$first": "$volume"},
                "chapter": {"$first": "$chapter"},
                "translatedLanguage": {"$first": "$translatedLanguage"},
            }
        },
        # Replace the "chapters" collection with the result.
        {"$out": "chapters"},
    ]
)

print(f"There are '{db.chapters.count_documents({})}' chapters in the database")

# Pair, for every book, the full-color and monochrome versions of each chapter
# and write the pairs back to the "chapters" collection.
db.books.aggregate(
    [
        {
            # Join each book with the chapters belonging either to the book
            # itself (its `_id` is used as the full-color manga id) or to its
            # monochrome counterpart.
            "$lookup": {
                "from": "chapters",
                "let": {
                    "full_color_book_id": "$_id",
                    "monochrome_book_id": "$monochrome_id",
                    # NOTE(review): `book_title` is declared but never
                    # referenced inside the sub-pipeline below.
                    "book_title": "$title",
                },
                "pipeline": [
                    {
                        "$match": {
                            "$expr": {
                                "$or": [
                                    {"$eq": ["$$full_color_book_id", "$manga_id"]},
                                    {"$eq": ["$$monochrome_book_id", "$manga_id"]},
                                ]
                            }
                        }
                    },
                    {
                        # Group the chapters sharing the same
                        # "volume - chapter - language" key.
                        # NOTE(review): `$concat` yields null when one of its
                        # operands is null, so chapters without a volume or a
                        # chapter number presumably end up in a null-keyed
                        # group (filtered out further down) — confirm.
                        "$group": {
                            "_id": {
                                "$concat": [
                                    "$volume",
                                    " - ",
                                    "$chapter",
                                    " - ",
                                    "$translatedLanguage",
                                ]
                            },
                            "volume": {"$first": "$volume"},
                            "chapter": {"$first": "$chapter"},
                            "translatedLanguage": {"$first": "$translatedLanguage"},
                            # Collect every chapter id of the group, flagged
                            # with whether it comes from the full-color book.
                            "chapter_version": {
                                "$push": {
                                    "id": "$_id",
                                    # NOTE(review): `$expr` is documented as a
                                    # query operator; confirm the server
                                    # accepts it in this expression position.
                                    "is_fullcolor": {
                                        "$expr": {
                                            "$cond": {
                                                "if": {
                                                    "$eq": [
                                                        "$$full_color_book_id",
                                                        "$manga_id",
                                                    ]
                                                },
                                                "then": True,
                                                "else": False,
                                            }
                                        }
                                    },
                                }
                            },
                        },
                    },
                    {
                        # Keep only the groups holding at least one full-color
                        # AND at least one monochrome version.
                        "$match": {
                            "$and": [
                                {
                                    "chapter_version": {
                                        "$elemMatch": {"is_fullcolor": True}
                                    }
                                },
                                {
                                    "chapter_version": {
                                        "$elemMatch": {"is_fullcolor": False}
                                    }
                                },
                            ]
                        },
                    },
                ],
                "as": "chapters",
            },
        },
        # One document per (book, chapter group).
        {"$unwind": {"path": "$chapters"}},
        {
            # Drop untitled books, null group keys, and groups that do not
            # contain exactly two versions.
            "$match": {
                "$and": [
                    {"title": {"$ne": None}},
                    {"chapters._id": {"$ne": None}},
                    {"chapters.chapter_version": {"$size": 2}},
                ]
            }
        },
        {
            "$project": {
                # Document id: book title followed by the chapter group key.
                "_id": {"$concat": ["$title", "$chapters._id"]},
                "full_color_id": "$_id",
                "monochrome_id": "$monochrome_id",
                "title": "$title",
                "chapter_info": {
                    "id": "$chapters._id",
                    "volume": "$chapters.volume",
                    "chapter": "$chapters.chapter",
                    "translatedLanguage": "$chapters.translatedLanguage",
                    # Descending sort on `is_fullcolor`: the full-color
                    # version is at index 0, the monochrome one at index 1.
                    "chapter_version": {
                        "$sortArray": {
                            "input": "$chapters.chapter_version",
                            "sortBy": {"is_fullcolor": -1},
                        }
                    },
                },
            }
        },
        # Overwrite the "chapters" collection (the one read by the $lookup
        # above) with the paired chapters.
        {"$out": "chapters"},
    ]
)

print(f"There are '{db.chapters.count_documents({})}' chapters")

### 1.3. Fetch page's url
-------------------------

Cache the page URLs of every chapter retrieved so far.

In [None]:
# Base URL of the MangaDex API. It was never defined in this notebook, so on a
# fresh kernel every request below raised a NameError that the `except`
# silently printed. The value matches the URL pattern used by the aggregation
# that later reads these cached logs.
base_url = "https://api.mangadex.org"

chapters = list(db.chapters.find({}))
chapters_cnt = len(chapters)
for index, chapter in enumerate(chapters, start=1):
    # `chapter_version` is sorted full-color first, monochrome second.
    chapter_versions = chapter["chapter_info"]["chapter_version"]
    colorfull_id = chapter_versions[0]["id"]
    monochrome_id = chapter_versions[1]["id"]
    # Each request gets its own try/except so that a failure on one version
    # does not prevent the other version from being requested.
    for chapter_id in (monochrome_id, colorfull_id):
        url = f"{base_url}/at-home/server/{chapter_id}"
        try:
            log_client.get(url)
        except Exception as error:
            # Best effort: report the failing URL and carry on.
            print(f"Failed to fetch {url}: {error}")
    print(f"Progress {index}/{chapters_cnt}", end="\r", flush=True)
print("\nDone", flush=True)

In [61]:
# Pattern matching the cached at-home server requests; the capture group is
# the chapter id. Written as a raw string because "\:" and "\/" are invalid
# escape sequences in a regular Python string literal (a warning today, an
# error in a future Python release). The pattern value itself is unchanged.
# NOTE(review): the greedy `(.*)` also swallows an optional trailing slash, so
# the final `\/?` never has anything left to match.
AT_HOME_URL_REGEX = r"https\:\/\/api.mangadex.org\/at-home\/server\/(.*)\/?"

# Build the temporary "page_urls" collection from the cached at-home logs:
# one document per chapter id with its base URL, hash and page file names.
db.log.aggregate(
    [
        {
            "$match": {
                "_id": {
                    "$regex": AT_HOME_URL_REGEX
                },
            },
        },
        # Extract the chapter id from the request URL stored in `_id`.
        {
            "$addFields": {
                "returnMatch": {
                    "$regexFind": {
                        "input": "$_id",
                        "regex": AT_HOME_URL_REGEX,
                    }
                }
            }
        },
        {
            "$project": {
                "id": {"$arrayElemAt": ["$returnMatch.captures", 0]},
                "baseUrl": 1,
                "hash": "$chapter.hash",
                "url": "$chapter.data",
            }
        },
        {"$out": "page_urls"},
    ]
)

# Attach the page URLs of both versions to every chapter pair, then flatten
# the result into `fullcolor_version` / `monochromic_version` sub-documents.
#
# Index convention used throughout: `chapter_info.chapter_version` was sorted
# on `is_fullcolor` descending when the pairs were built, so index 0 is the
# full-color version and index 1 the monochrome one.
db.chapters.aggregate(
    [
        {
            # Fetch the page-url documents of both versions of the chapter.
            "$lookup": {
                "from": "page_urls",
                "let": {
                    "chapter_fullcolor": {
                        "$arrayElemAt": ["$chapter_info.chapter_version", 0]
                    },
                    "chapter_monochrome": {
                        "$arrayElemAt": ["$chapter_info.chapter_version", 1]
                    },
                },
                "pipeline": [
                    {
                        "$match": {
                            "$expr": {
                                "$or": [
                                    {"$eq": ["$$chapter_fullcolor.id", "$id"]},
                                    {"$eq": ["$$chapter_monochrome.id", "$id"]},
                                ]
                            }
                        },
                    },
                    {
                        "$project": {
                            "id": 1,
                            "_id": 0,
                            "hash": 1,
                            "baseUrl": 1,
                            "is_fullcolor": {
                                "$expr": {
                                    "$cond": {
                                        "if": {
                                            "$eq": [
                                                "$$chapter_fullcolor.id",
                                                "$id",
                                            ]
                                        },
                                        "then": True,
                                        "else": False,
                                    }
                                }
                            },
                            "url": "$url",
                        }
                    },
                    # Same ordering as `chapter_version`: full-color first.
                    {"$sort": {"is_fullcolor": -1}},
                ],
                "as": "data",
            }
        },
        {
            # Keep only chapters for which both versions have page URLs.
            "$match": {
                "data": {
                    "$not": {"$elemMatch": {"url": {"$exists": False}}},
                },
                "data.0": {"$exists": True},
                "data.1": {"$exists": True},
            }
        },
        {
            # Merge the full-color page data into version index 0.
            "$addFields": {
                "chapter_info.chapter_version": {
                    "$map": {
                        "input": {
                            "$range": [0, {"$size": "$chapter_info.chapter_version"}]
                        },
                        "as": "idx",
                        "in": {
                            "$cond": {
                                "if": {"$eq": ["$$idx", 0]},
                                "then": {
                                    "$mergeObjects": [
                                        {
                                            "$arrayElemAt": [
                                                "$chapter_info.chapter_version",
                                                "$$idx",
                                            ]
                                        },
                                        {"$arrayElemAt": ["$data", 0]},
                                    ]
                                },
                                "else": {
                                    "$arrayElemAt": [
                                        "$chapter_info.chapter_version",
                                        "$$idx",
                                    ]
                                },
                            }
                        },
                    }
                }
            }
        },
        {
            # Merge the monochrome page data into version index 1.
            "$addFields": {
                "chapter_info.chapter_version": {
                    "$map": {
                        "input": {
                            "$range": [0, {"$size": "$chapter_info.chapter_version"}]
                        },
                        "as": "idx",
                        "in": {
                            "$cond": {
                                "if": {"$eq": ["$$idx", 1]},
                                "then": {
                                    "$mergeObjects": [
                                        {
                                            "$arrayElemAt": [
                                                "$chapter_info.chapter_version",
                                                "$$idx",
                                            ]
                                        },
                                        {"$arrayElemAt": ["$data", 1]},
                                    ]
                                },
                                "else": {
                                    "$arrayElemAt": [
                                        "$chapter_info.chapter_version",
                                        "$$idx",
                                    ]
                                },
                            }
                        },
                    }
                }
            }
        },
        {
            # Flatten the pair. The two labels used to be swapped
            # (`monochromic_version` was read from index 0, the full-color
            # entry); they now follow the index convention stated above.
            "$project": {
                "volume": "$chapter_info.volume",
                "chapter": "$chapter_info.chapter",
                "translatedLanguage": "$chapter_info.translatedLanguage",
                "fullcolor_version": {
                    "$arrayElemAt": ["$chapter_info.chapter_version", 0]
                },
                "monochromic_version": {
                    "$arrayElemAt": ["$chapter_info.chapter_version", 1]
                },
            }
        },
        # Overwrite the "chapters" collection with the flattened documents.
        {"$out": "chapters"},
    ]
)

# The intermediate collection is no longer needed once merged.
db.page_urls.drop()

In [None]:
chapters = list(db.chapters.find({}))
chapters_cnt = len(chapters)


def register_urls(chapter, index):
    """Register every page of one chapter version in the image databank.

    `chapter` is a version sub-document providing "url" (the list of pages),
    "baseUrl" and "hash"; `index` is the zero-based position of the parent
    chapter and is only used for the progress display.
    """
    pages = chapter["url"]
    url_nbr = len(pages)
    idx = 0
    while idx < url_nbr:
        url = pages[idx]
        print(
            f"Total progress {index+1}/{chapters_cnt}, Download {idx+1}/{url_nbr}           ",
            end="\r",
            flush=True,
        )
        img_databank.register(chapter["baseUrl"], chapter["hash"], url)
        idx += 1


# Full-color pages first, then monochrome ones, chapter by chapter.
for index, chapter in enumerate(chapters):
    for version_key in ("fullcolor_version", "monochromic_version"):
        register_urls(chapter[version_key], index)

print("\nDone", flush=True)

In [None]:
# Imported here because this cell uses it: the notebook only imports `pprint`
# in a later cell, so a fresh "Run All" would otherwise hit a NameError.
import pprint

# Count the number of pages already downloaded.
chapters = list(db.chapters.find({}))
chapters_cnt = len(chapters)


def count_total_download(chapter, index):
    """Count the pages of one chapter version already present in the databank.

    `chapter` is a version sub-document providing "url" (the list of pages),
    "baseUrl" and "hash"; `index` is the zero-based position of the parent
    chapter and is only used for the progress display.

    Returns the number of pages for which `img_databank.has(...)` is truthy.
    A version without a "url" entry is printed for inspection and counted as
    zero instead of raising a KeyError.
    """
    if "url" not in chapter:
        pprint.pp(chapter)
        return 0
    urls = chapter["url"]
    url_nbr = len(urls)
    total_download = 0
    for idx, url in enumerate(urls):
        print(
            f"Total progress {index+1}/{chapters_cnt}, Download {idx+1}/{url_nbr}           ",
            end="\r",
            flush=True,
        )
        if img_databank.has(chapter["baseUrl"], chapter["hash"], url):
            total_download += 1
    return total_download


total_download = 0
for index, chapter in enumerate(chapters):
    total_download += count_total_download(chapter["fullcolor_version"], index)
    total_download += count_total_download(chapter["monochromic_version"], index)

print(f"\n\nDone, total download: {total_download}", flush=True)

In [None]:
import matplotlib.pyplot as plt
import pprint


def _chapter_versions(chapter):
    """Return the version sub-documents of a chapter document.

    Supports both layouts of the "chapters" collection: the nested one
    (`chapter_info.chapter_version`) and the flattened one produced once the
    page URLs have been merged (`fullcolor_version` / `monochromic_version`).
    The original cell only read the nested layout and therefore raised a
    KeyError when run after the merge cell.
    """
    if "chapter_info" in chapter:
        return chapter["chapter_info"]["chapter_version"]
    return [
        chapter[key]
        for key in ("fullcolor_version", "monochromic_version")
        if key in chapter
    ]


def _has_downloaded_page(chapter_version):
    """Tell whether at least one page of the version is in the image databank."""
    if "url" not in chapter_version:
        return False
    return any(
        img_databank.has(chapter_version["baseUrl"], chapter_version["hash"], url)
        for url in chapter_version["url"]
    )


chapters = list(db.chapters.find({}))

# First chapter having at least one downloaded page in any of its versions,
# or None when nothing has been downloaded yet.
filled_chapter = next(
    (
        chapter
        for chapter in chapters
        if any(_has_downloaded_page(version) for version in _chapter_versions(chapter))
    ),
    None,
)

pprint.pp(filled_chapter)