# Raw data directory

In [28]:
data_dir = "../data/raw/"
data_dir = data_dir.rstrip("/")
print(f"data_dir: {data_dir}")  # Output: ../data/raw

data_dir: ../data/raw


# Elinimate manga that is not existed in both EN and VI

In [35]:
import os

languages = ["en", "vi"]

manga_sets = {}
for lang in languages:
    lang_dir = os.path.join(data_dir, lang)
    if os.path.exists(lang_dir):
        manga_sets[lang] = set(
            name for name in os.listdir(lang_dir)
            if os.path.isdir(os.path.join(lang_dir, name))
        )
    else:
        manga_sets[lang] = set()

# Find manga not present in both languages
only_in_en = manga_sets["en"] - manga_sets["vi"]
only_in_vi = manga_sets["vi"] - manga_sets["en"]
missing_titles = {
    "en": only_in_en,
    "vi": only_in_vi
}

print("Manga only in EN:", missing_titles["en"])
print("Manga only in VI:", missing_titles["vi"])

Manga only in EN: set()
Manga only in VI: {'test_data', 'Ghost Fixers', 'Night Light Hounds', 'The Marshal King', 'Empyreal Cabinet', 'Ultimate Exorcist Kiyoshi'}


In [None]:
import os
import shutil
from datetime import datetime

missing_title_dir = os.path.join(os.path.dirname(data_dir), "missing_titles")
print(missing_title_dir)
languages = ["en", "vi"]
missing_title_dir_log_path = os.path.join(missing_title_dir, "history_log.txt")

for lang in languages:
    os.makedirs(os.path.join(missing_title_dir, lang), exist_ok=True)
    for manga in missing_titles[lang]:
        src = os.path.join(data_dir, lang, manga)
        dst = os.path.join(missing_title_dir, lang, manga)
        if os.path.exists(src):
            shutil.move(src, dst)
            log_entry = f"{datetime.now().isoformat()} - {manga} → {lang}\n"
            print(log_entry)
            with open(missing_title_dir_log_path, "a") as log_file:
                log_file.write(log_entry)

../data\missing_titles
2025-11-05T17:36:20.224021 - test_data → vi

2025-11-05T17:36:20.225134 - Ghost Fixers → vi

2025-11-05T17:36:20.225610 - Night Light Hounds → vi

2025-11-05T17:36:20.226058 - The Marshal King → vi

2025-11-05T17:36:20.226560 - Empyreal Cabinet → vi

2025-11-05T17:36:20.227162 - Ultimate Exorcist Kiyoshi → vi



# Eliminate manga with un-equal number of chapters.

In [None]:
import os
import shutil
from datetime import datetime 

missing_chapter_dir = os.path.join(os.path.dirname(data_dir), "missing_chapters")
languages = ["en", "vi"]
manga_common = set(os.listdir(os.path.join(data_dir, "en"))) & set(os.listdir(os.path.join(data_dir, "vi")))
missing_chapter_dir_log_path = os.path.join(missing_chapter_dir, "history_log.txt")

for lang in languages:
    os.makedirs(os.path.join(missing_chapter_dir, lang), exist_ok=True)

for manga in manga_common:
    character_dict = {}
    for lang in languages:
        character_dict[lang] = os.listdir(os.path.join(data_dir, lang, manga))

    if character_dict["en"] != character_dict["vi"]:
        for lang in languages:
            src = os.path.join(data_dir, lang, manga)
            dst = os.path.join(missing_chapter_dir, lang, manga)
            shutil.move(src, dst)
            log_entry = f"{datetime.now().isoformat()} - {manga} → {lang}\n"
            print(log_entry)
            with open(missing_chapter_dir_log_path, "a") as log_file:
                log_file.write(log_entry)


2025-11-05T17:45:15.596389 - Kurokami Seiso no Reikoku Bishoujo o Tasuketara, Ore to Futarikiri no Toki dake Dereru You ni Natta Ken → en

2025-11-05T17:45:15.597398 - Kurokami Seiso no Reikoku Bishoujo o Tasuketara, Ore to Futarikiri no Toki dake Dereru You ni Natta Ken → vi

2025-11-05T17:45:15.598463 - Kimi wa Ore no Yasashikunai Haru → en

2025-11-05T17:45:15.599007 - Kimi wa Ore no Yasashikunai Haru → vi

2025-11-05T17:45:15.609617 - Kaji Daikou no Arubaito wo Hajimetara Gakuen Ichi no Bishoujo no Kazoku ni Kiniirarechaimashita → en

2025-11-05T17:45:15.610433 - Kaji Daikou no Arubaito wo Hajimetara Gakuen Ichi no Bishoujo no Kazoku ni Kiniirarechaimashita → vi

2025-11-05T17:45:15.614050 - Eclair - Anata ni Hibiku Yuri Anthology → en

2025-11-05T17:45:15.614735 - Eclair - Anata ni Hibiku Yuri Anthology → vi

2025-11-05T17:45:15.615598 - Tonari no Wakao-san wa Miesou de Mienai → en

2025-11-05T17:45:15.616067 - Tonari no Wakao-san wa Miesou de Mienai → vi

2025-11-05T17:45:15.6183

# Eliminate manga with un-equal number of page per chapter 

In [42]:
import os
import shutil
from datetime import datetime

unequal_page_dir = os.path.join(os.path.dirname(data_dir), "missing_pages")
languages = ["en", "vi"]
manga_common = set(os.listdir(os.path.join(data_dir, "en"))) & set(os.listdir(os.path.join(data_dir, "vi")))
unequal_page_dir_log_path = os.path.join(unequal_page_dir, "history_log.txt")

for lang in languages:
    os.makedirs(os.path.join(unequal_page_dir, lang), exist_ok=True)

def list_dirs(path):
    return [name for name in os.listdir(path) if os.path.isdir(os.path.join(path, name))]


for manga in manga_common:
    # Get chapter lists for both languages
    en_chapters = list_dirs(os.path.join(data_dir, "en", manga))
    vi_chapters = list_dirs(os.path.join(data_dir, "vi", manga))
    # Only check chapters that exist in both
    common_chapters = set(en_chapters) & set(vi_chapters)
    unequal = False
    for chapter in common_chapters:
        en_pages = os.listdir(os.path.join(data_dir, "en", manga, chapter))
        vi_pages = os.listdir(os.path.join(data_dir, "vi", manga, chapter))
        if len(en_pages) != len(vi_pages):
            unequal = True
            break
    if unequal:
        for lang in languages:
            src = os.path.join(data_dir, lang, manga)
            dst = os.path.join(unequal_page_dir, lang, manga)
            if os.path.exists(src):
                shutil.move(src, dst)
                log_entry = f"{datetime.now().isoformat()} - {manga} → {lang}\n"
                print(log_entry)
                with open(unequal_page_dir_log_path, "a") as log_file:
                    log_file.write(log_entry)

2025-11-05T18:20:49.435912 - Koi yori Aoku → en

2025-11-05T18:20:49.436905 - Koi yori Aoku → vi

2025-11-05T18:20:49.438058 - Soshiki no Shukuteki to Kekkon Shitara Mecha Amai → en

2025-11-05T18:20:49.438941 - Soshiki no Shukuteki to Kekkon Shitara Mecha Amai → vi

2025-11-05T18:20:49.440309 - Imokusa Reijou desu ga Akuyaku Reisoku wo Tasuketara Kiniiraremashita → en

2025-11-05T18:20:49.441014 - Imokusa Reijou desu ga Akuyaku Reisoku wo Tasuketara Kiniiraremashita → vi

2025-11-05T18:20:49.441817 - FateSamurai Remnant → en

2025-11-05T18:20:49.442288 - FateSamurai Remnant → vi

2025-11-05T18:20:49.443110 - Nibanme na Boku to Ichiban no Kanojo → en

2025-11-05T18:20:49.443534 - Nibanme na Boku to Ichiban no Kanojo → vi

2025-11-05T18:20:49.444434 - Okinawa de Suki ni Natta Ko ga Hougen Sugite Tsura Sugiru → en

2025-11-05T18:20:49.444869 - Okinawa de Suki ni Natta Ko ga Hougen Sugite Tsura Sugiru → vi

2025-11-05T18:20:49.448618 - Shy → en

2025-11-05T18:20:49.449185 - Shy → vi

2025