# Mangadex

The command below is used to run on Window, Linux will have different command

In [None]:
import os
import subprocess
import random


def download_test_set(path_to_dataset_folder="./"):
    vi_folder = os.path.join(path_to_dataset_folder, "vi")
    en_folder = os.path.join(path_to_dataset_folder, "en")
    os.makedirs(vi_folder, exist_ok=True)
    os.makedirs(en_folder, exist_ok=True)
    print("Downloading images from MangaDex...")

    for order, link in enumerate(mangadex_links):
        for lang, out_dir in [("en", en_folder), ("vi", vi_folder)]:
            args = [
                "mangadex-dl",
                link,
                "--path",
                os.path.join(out_dir, "{manga.title}"),
                "--no-group-name",
                # "--replace",
                "--log-level",
                "WARNING",
                "--use-compressed-image",
                "--cover",
                "none",
                "--language",
                lang,
                "--delay-requests",
                f"{(round(random.uniform(0.5, 0.75), 2))}",
            ]
            try:
                subprocess.run(args, check=True)
                print(
                    f"Finsihed {lang.upper()} {order+1}/{len(mangadex_links)}: {link}"
                )
                continue
            except subprocess.CalledProcessError as e:
                print(
                    f"Error {lang.upper()} {order+1}/{len(mangadex_links)}: {link} {e}"
                )
                continue

    print(f"Manga saved to {path_to_dataset_folder}")


mangadex_links = [
    # "https://mangadex.org/title/0a2fd70d-e4de-49fb-8e81-89311d46b329/ruri-dragon-oneshot",
    "https://mangadex.org/title/63f2efed-e0cf-4ad1-98a9-7fba7f5848c1/the-raise",
]

# saved_folder = "../data"
saved_folder = "./"
download_test_set(path_to_dataset_folder=saved_folder)

Downloading images from MangaDex...
Finsihed EN 1/1: https://mangadex.org/title/63f2efed-e0cf-4ad1-98a9-7fba7f5848c1/the-raise
Finsihed VI 1/1: https://mangadex.org/title/63f2efed-e0cf-4ad1-98a9-7fba7f5848c1/the-raise
Manga saved to ./


CLi version

In [None]:
!mangadex-dl https://mangadex.org/title/63f2efed-e0cf-4ad1-98a9-7fba7f5848c1/the-raise --language "vi" --path "./VN/{manga.title}" --cover "none"

[INFO] Checking url = https://mangadex.org/title/63f2efed-e0cf-4ad1-98a9-7fba7f5848c1/the-raise
[INFO] Using Vietnamese language
[INFO] Fetching all chapters...
[INFO] Download directory is set to "D:\Downloads\Tu_Lieu\Cao_Hoc\Master_Thesis\master-thesis\utils\VN\The Raise"
[INFO] Using raw format
[INFO] Downloading cover manga The Raise
[INFO] Not downloading cover manga, since "cover" is none
[INFO] Getting images from chapter 1
[INFO] Downloading [Senukin] Chapter. 1 page 1

File sizes:   0%|                                   | 0.00/1.55M [00:00<?, ?B/s]
File sizes:  13%|███▍                       | 197k/1.55M [00:00<00:01, 1.19MB/s]
File sizes:  36%|█████████▊                 | 565k/1.55M [00:00<00:00, 2.34MB/s]
File sizes:  77%|███████████████████▉      | 1.19M/1.55M [00:00<00:00, 3.88MB/s]
File sizes: 100%|██████████████████████████| 1.55M/1.55M [00:00<00:00, 3.73MB/s]
[INFO] Getting images from chapter 2
[INFO] Downloading [Senukin] Chapter. 2 page 1

File sizes:   0%|          

# Custom Script to download data that has both Vietnam and English language

API approach to list all mange titles

In [None]:
import requests
import time
import random
import json
import os

CHECKPOINT_FILE_MANGA = f"../mangadexdl_checkpoint/preprocess/valid_manga_en_vi_{int(time.time())}.txt"  # include unix time in seconds


def load_checkpoint():
    """Return the last 'offset' written in RESULT_FILE (or 0 if none)."""
    if not os.path.exists(CHECKPOINT_FILE_MANGA):
        return 0, 0
    with open(CHECKPOINT_FILE_MANGA, "r", encoding="utf-8") as f:
        lines = f.readlines()
        if not lines:
            return 0, 0
        last_entry = json.loads(lines[-1])
        return last_entry["offset"], last_entry["number"]


def get_manga_with_en_vi():
    base_url = "https://api.mangadex.org/manga"
    limit = 100
    results = []
    offset_checkpoint, number_checkpoint = load_checkpoint()
    offset = offset_checkpoint
    manually_delay = 0

    while True:
        params = {
            "originalLanguage[]": "ja",
            "contentRating[]": "safe",
            "availableTranslatedLanguage[]": ["vi"],
            "limit": limit,
            "offset": offset,
        }
        r = requests.get(base_url, params=params)
        r.raise_for_status()
        data = r.json()

        # Continue with checkpoint due to too many requests
        with open(CHECKPOINT_FILE_MANGA, "a", encoding="utf-8") as f:
            for manga in data["data"]:
                langs = manga["attributes"].get("availableTranslatedLanguages", [])
                # Keep only if BOTH en and vi are present
                if "en" in langs and "vi" in langs:
                    title = (
                        manga["attributes"]["title"].get("en")
                        or list(manga["attributes"]["title"].values())[0]
                    )
                    manga_id = manga["id"]
                    results.append((manga_id, title))
                    # Write each result as a JSON line
                    total_data_collected = number_checkpoint + len(results)
                    f.write(
                        json.dumps(
                            {
                                "offset": offset + limit,
                                "number": total_data_collected,
                                "id": manga_id,
                                "full_data": manga,
                            },
                            ensure_ascii=False,
                        )
                        + "\n"
                    )

        # Stop when no more data
        total = data["total"]
        offset += limit
        print(
            f"offset: {offset}, total: {total}, collected from beginning of the run: {len(results)}"
        )
        if offset >= total:  # ✅ stop at the end
            break

        # ✅ Sleep to avoid hammering API
        manually_delay += 1

        if manually_delay % 5 == 0:
            time.sleep(10)  # sleep every 10 requests for x seconds
        else:
            delay = round(random.uniform(2.5, 4), 2)
            time.sleep(delay)

    return results


if __name__ == "__main__":
    mangas = get_manga_with_en_vi()
    print(f"Found total of {len(mangas)} manga with EN+VI translations")
    print(f"Found {len(mangas)} manga with EN+VI translations")

offset: 100, total: 3800, collected from beginning of the run: 99
offset: 200, total: 3800, collected from beginning of the run: 197
offset: 300, total: 3800, collected from beginning of the run: 290
offset: 400, total: 3800, collected from beginning of the run: 385
offset: 500, total: 3800, collected from beginning of the run: 479
offset: 600, total: 3800, collected from beginning of the run: 568
offset: 700, total: 3800, collected from beginning of the run: 658
offset: 800, total: 3800, collected from beginning of the run: 747
offset: 900, total: 3800, collected from beginning of the run: 833
offset: 1000, total: 3800, collected from beginning of the run: 927
offset: 1100, total: 3800, collected from beginning of the run: 1021
offset: 1200, total: 3800, collected from beginning of the run: 1112
offset: 1300, total: 3800, collected from beginning of the run: 1193
offset: 1400, total: 3800, collected from beginning of the run: 1276
offset: 1500, total: 3800, collected from beginning of

Postprocess manga file list (delete duplicate entries)

In [None]:
input_path = "../mangadexdl_checkpoint/preprocess/valid_manga_en_vi_1760159521.txt"
output_path = input_path.replace("preprocess", "postprocess").replace(
    ".txt", "_dedup.txt"
)

seen_ids = set()
deleted_ids = []

with open(input_path, "r", encoding="utf-8") as infile, open(
    output_path, "w", encoding="utf-8"
) as outfile:
    for line in infile:
        if not line.strip():
            continue
        try:
            entry = json.loads(line)
            manga_id = entry["id"]
        except Exception:
            # If line is just an ID string, not JSON
            manga_id = line.strip()
        if manga_id not in seen_ids:
            seen_ids.add(manga_id)
            outfile.write(line)
        else:
            deleted_ids.append(manga_id)

print("Deleted duplicate IDs:")
for did in deleted_ids:
    print(did)

Deleted duplicate IDs:
61d53bb7-b868-4ab2-ac1f-d309581e275e


API approach to download chapters

In [1]:
import json


def load_manga_ids(txt_path):
    manga_ids = []
    with open(txt_path, "r", encoding="utf-8") as f:
        for line in f:
            if line.strip():
                entry = json.loads(line)
                manga_ids.append(entry["id"])
    return list(dict.fromkeys(manga_ids))  # eliminate duplicates


# Example usage:
# txt_file = "../mangadexdl_checkpoint/valid_manga_en_vi_1.txt"
manga_id_list = load_manga_ids(
    "../mangadexdl_checkpoint/postprocess/valid_manga_en_vi_1760159521_dedup.txt"
)
print(f"manga_id_list ({len(manga_id_list)} manga titles): {manga_id_list}")

manga_id_list (3382 manga titles): ['0cfcda3b-0dcb-459b-9931-aa823e7bf403', '9cd31469-d050-4523-b1eb-e019cd7f65d0', '0b1598ed-6ac2-4c74-92cf-44fd0f3ab7fe', '778b8438-af3a-4534-841c-adf923b55ee1', '71772ad7-505f-4180-a317-583034890a6c', '56901ae8-aece-4983-96c3-94751c96a8bb', 'ed996855-70de-449f-bba2-e8e24224c14d', '3486c56e-47db-4d62-a9b4-71ea44acbaec', '5d36a83a-ccce-4410-a4c0-c80b16f3656a', 'fe1394fe-3d67-48fb-b452-06580d2dd9d7', '30f3ac69-21b6-45ad-a110-d011b7aaadaa', 'df361a38-eef6-4674-bec6-86d08aa1d1aa', '657ccfcc-5847-40f7-8859-5631eeeb3784', 'a6ccbfd7-4f32-4566-9e56-187189984a61', '1ee97895-4796-4bcf-bcd1-5ef99c011f8b', 'fa442671-f5ef-4397-93c4-0560b9a3a278', '4e54f039-7cc2-4748-b814-51336d68e821', '044fec5a-405c-4651-9f79-9794c7ec2e2c', '6d922c2c-f118-424c-8d21-78d9b24f25b5', 'f5218b50-8846-44ec-b4d5-fba2ae2fff54', 'c53caf9a-95b6-4400-b41b-d1b09167ef7a', '50491535-8d93-4f37-a885-14441766bd43', '196e48dd-7e08-4fed-ae66-f07f7c634c30', 'ffa0d510-e605-4bbe-8017-4295837718aa', 'aa4

In [None]:
import os
import requests
import subprocess
import time
import json
import random

CHECKPOINT_FILE_CHAPTER = (
    "../mangadexdl_checkpoint/preprocess/valid_chapter_en_vi_0.txt"
)


def get_last_checkpoint_number(checkpoint_path):
    if not os.path.exists(checkpoint_path):
        return 0
    with open(checkpoint_path, "r", encoding="utf-8") as f:
        lines = f.readlines()
        if not lines:
            return 0
        last_entry = json.loads(lines[-1])
        return last_entry.get("number", 0)  # default to 0 if not found


def get_common_chapters(manga_id):
    base_url = "https://api.mangadex.org/chapter"
    chapters = {"en": [], "vi": []}
    limit = 100
    offset = 0

    while True:
        filters = {
            "manga": manga_id,
            "translatedLanguage[]": ["en", "vi"],
            "limit": limit,
            "offset": offset,
        }
        r = requests.get(base_url, params=filters)
        r.raise_for_status()
        data = r.json()

        for ch in data["data"]:
            chap_num = ch["attributes"].get("chapter")
            lang = ch["attributes"].get("translatedLanguage")
            if chap_num and lang in chapters:
                chapters[lang].append(chap_num)

        # Stop when no more chapters
        if len(data["data"]) < limit:
            break
        offset += limit

    # Intersection → only chapters that exist in both en and vi
    common_chapters = [ch for ch in chapters["en"] if ch in chapters["vi"]]
    common_chapters_dedup = list(
        dict.fromkeys(common_chapters)
    )  # remove duplicates while preserving order
    print(f"Found {len(common_chapters_dedup)} common chapters for manga ID {manga_id}")
    return common_chapters_dedup


def download_chapter(manga_url, chapter_num, lang, out_dir):
    if chapter_num == "0":
        # temp fix because if start and end chapter are both "0", it downloads all chapters
        start_chapter = "0"
        end_chapter = "0.1"
    else:
        start_chapter = chapter_num
        end_chapter = chapter_num
    args = [
        "mangadex-dl",
        manga_url,
        "--path",
        os.path.join(out_dir, "{manga.title}"),
        "--no-group-name",
        # "--replace",
        "--log-level",
        "WARNING",
        "--use-compressed-image",
        "--cover",
        "none",
        "--language",
        lang,
        "--start-chapter",
        start_chapter,
        "--end-chapter",
        end_chapter,
        "--no-group-name",
        "--delay-requests",
        f"{(round(random.uniform(0.2, 0.3), 2))}",
    ]
    subprocess.run(args, check=True)


def download_dataset(path_to_dataset_folder="./"):
    vi_folder = os.path.join(path_to_dataset_folder, "vi")
    en_folder = os.path.join(path_to_dataset_folder, "en")
    os.makedirs(vi_folder, exist_ok=True)
    os.makedirs(en_folder, exist_ok=True)
    manually_delay = 0
    checkpoint_number = get_last_checkpoint_number(CHECKPOINT_FILE_CHAPTER)
    print(
        f"Resuming from checkpoint number {checkpoint_number} with manga_id {manga_id_list[checkpoint_number-1]}"
    )

    with open(CHECKPOINT_FILE_CHAPTER, "a", encoding="utf-8") as f:
        for number, manga_id in enumerate(manga_id_list):
            if number < checkpoint_number:
                continue  # Skip already processed manga IDs

            print(f"----------Processing: {number+1}/{len(manga_id_list)}----------")
            common_chapters = get_common_chapters(manga_id)
            print(f"Common chapters: {common_chapters}")

            if not common_chapters:
                print(f"No common chapters for {manga_id}")

            for ch in common_chapters:
                for lang, out_dir in [("en", en_folder), ("vi", vi_folder)]:
                    link = f"https://mangadex.org/title/{manga_id}"
                    try:
                        download_chapter(link, ch, lang, out_dir)
                        print(f"Downloaded {lang.upper()} chapter {ch} from {manga_id}")
                    except subprocess.CalledProcessError as e:
                        print(
                            f"Failed {lang.upper()} chapter {ch} from {manga_id}: {e}. Error output: {e.output}"
                        )

                manually_delay += 1
                if manually_delay % 5 == 0:
                    time.sleep(4)  # sleep every 10 requests for x seconds
                else:
                    delay = round(random.uniform(1.5, 2.5), 2)
                    time.sleep(delay)

            # Write log process
            f.write(
                json.dumps(
                    {
                        "number": number + 1,
                        "id": manga_id,
                        "total_common_chapter": len(common_chapters),
                        "common_chapter": common_chapters,
                    },
                    ensure_ascii=False,
                )
                + "\n"
            )
            f.flush()  # Flush should make the file update immediately after each manga_id loop


download_dataset("../data")

Resuming from checkpoint number 21 with manga_id c53caf9a-95b6-4400-b41b-d1b09167ef7a
----------Processing: 22/3382----------
Found 10 common chapters for manga ID 50491535-8d93-4f37-a885-14441766bd43
Common chapters: ['0', '1.1', '1.2', '2.1', '2.2', '3.1', '3.2', '4.1', '4.2', '5']
Downloaded EN chapter 0 from 50491535-8d93-4f37-a885-14441766bd43
Downloaded VI chapter 0 from 50491535-8d93-4f37-a885-14441766bd43
Downloaded EN chapter 1.1 from 50491535-8d93-4f37-a885-14441766bd43
Downloaded VI chapter 1.1 from 50491535-8d93-4f37-a885-14441766bd43
Downloaded EN chapter 1.2 from 50491535-8d93-4f37-a885-14441766bd43
Downloaded VI chapter 1.2 from 50491535-8d93-4f37-a885-14441766bd43
Downloaded EN chapter 2.1 from 50491535-8d93-4f37-a885-14441766bd43
Downloaded VI chapter 2.1 from 50491535-8d93-4f37-a885-14441766bd43
Downloaded EN chapter 2.2 from 50491535-8d93-4f37-a885-14441766bd43
Downloaded VI chapter 2.2 from 50491535-8d93-4f37-a885-14441766bd43
Downloaded EN chapter 3.1 from 50491535

Counting how many 

In [None]:
import json

input_path = "../mangadexdl_checkpoint/preprocess/valid_chapter_en_vi_0.txt"

with open(input_path, "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue
        entry = json.loads(line)
        number = entry.get("number")
        chapters = entry.get("common_chapter", [])
        print(f"Number {number}: {len(chapters)} common chapters")

In [2]:
import json

input_path = "../mangadexdl_checkpoint/preprocess/valid_chapter_en_vi_0.txt"
output_path = "../mangadexdl_checkpoint/preprocess/valid_chapter_en_vi_0_noerror.txt"

with open(input_path, "r", encoding="utf-8") as infile, open(output_path, "w", encoding="utf-8") as outfile:
    for line in infile:
        if not line.strip():
            continue
        entry = json.loads(line)
        if "error_chapter" in entry:
            del entry["error_chapter"]
        outfile.write(json.dumps(entry, ensure_ascii=False) + "\n")