In [4]:
!pip install moviepy



In [10]:
import os
import re
import json
import ebooklib
from ebooklib import epub
from bs4 import BeautifulSoup

# --- Extract the text of a single ePub ---
def extract_text_from_epub(file_path):
    """Return the plain text of every document item in an ePub file.

    Args:
        file_path: Path of the ePub file, passed to ``epub.read_epub``.

    Returns:
        The text of each ``ebooklib.ITEM_DOCUMENT`` item, in the order
        ``book.get_items()`` yields them, joined with newlines. A book
        without document items yields an empty string.
    """
    book = epub.read_epub(file_path)
    # Join with a newline rather than concatenating directly: otherwise the
    # last word of one document and the first word of the next are fused
    # into a single token, which corrupts word-based chunking downstream.
    return '\n'.join(
        BeautifulSoup(item.content, 'html.parser').get_text()
        for item in book.get_items()
        if item.get_type() == ebooklib.ITEM_DOCUMENT
    )

# --- Text cleaning ---
def clean_text(raw_text):
    """Normalize special characters and whitespace in extracted text.

    Non-breaking spaces (U+00A0) become ordinary spaces, U+200F characters
    are removed, every run of whitespace (including newlines) collapses to
    a single space, and leading/trailing whitespace is stripped.

    Args:
        raw_text: The text to clean.

    Returns:
        The cleaned, single-line text.
    """
    # One pass handles both special characters; None deletes the character.
    special_chars = str.maketrans({'\xa0': ' ', '\u200f': None})
    normalized = raw_text.translate(special_chars)
    return re.sub(r'\s+', ' ', normalized).strip()

# --- Split text into chunks (for the LLM / later analysis) ---
def split_into_chunks(text, max_words=500):
    """Split text into chunks of at most ``max_words`` words each.

    Words are the whitespace-separated tokens of ``text``; each chunk is
    its words re-joined with single spaces.

    Args:
        text: The text to split.
        max_words: Maximum number of words per chunk; must be at least 1.

    Returns:
        A list of chunk strings in order. Empty or whitespace-only text
        yields an empty list.

    Raises:
        ValueError: If ``max_words`` is less than 1.
    """
    # Without this check, 0 fails with an unrelated range() error and a
    # negative value silently returns no chunks, dropping the whole text.
    if max_words < 1:
        raise ValueError(f"max_words must be at least 1, got {max_words!r}")
    words = text.split()
    return [
        ' '.join(words[start:start + max_words])
        for start in range(0, len(words), max_words)
    ]

# --- Main routine for processing many ePub files ---
def process_all_epubs(epub_folder, output_folder, max_words=500):
    """Extract, clean and chunk every ePub in a folder, then save the results.

    For each entry of ``epub_folder`` whose name ends in ``.epub``
    (case-insensitive), the text is extracted, cleaned and split into
    chunks. The chunks are written to ``<output_folder>/<book_id>.txt``,
    where ``book_id`` is the file name without its extension, and all
    books are collected into ``<output_folder>/epub_chunks.json`` as a
    mapping of ``book_id`` to its list of chunks.

    Args:
        epub_folder: Folder containing the ePub files.
        output_folder: Folder for the output files; created if missing.
        max_words: Maximum number of words per chunk.
    """
    os.makedirs(output_folder, exist_ok=True)
    results = {}  # book_id -> list of chunks, for every processed book

    # os.listdir returns entries in arbitrary order; sorting keeps the log
    # and the JSON key order reproducible between runs.
    for filename in sorted(os.listdir(epub_folder)):
        # Compare case-insensitively so files named "*.EPUB" are not skipped.
        if not filename.lower().endswith(".epub"):
            continue

        book_id = os.path.splitext(filename)[0]
        print(f"Proses: {filename}")

        full_path = os.path.join(epub_folder, filename)
        raw_text = extract_text_from_epub(full_path)
        cleaned_text = clean_text(raw_text)
        chunks = split_into_chunks(cleaned_text, max_words=max_words)

        # Save this book's chunks to a .txt file, one labelled block each.
        output_txt = os.path.join(output_folder, f"{book_id}.txt")
        with open(output_txt, "w", encoding="utf-8") as f:
            f.writelines(
                f"[Chunk {number}]\n{chunk}\n\n"
                for number, chunk in enumerate(chunks, start=1)
            )

        results[book_id] = chunks

    # Save every book's chunks into a single .json file.
    output_json = os.path.join(output_folder, "epub_chunks.json")
    with open(output_json, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

    print("Preprocessing selesai. File disimpan di:", output_folder)

# --- Run the program ---
# Folder that holds the source ePub files; change it to match your setup.
epub_input_folder = "epubs"
# Folder that receives the per-book .txt files and the combined .json file.
output_clean_folder = "output/cleaned_text"
process_all_epubs(
    epub_folder=epub_input_folder,
    output_folder=output_clean_folder,
)


Proses: الأربعون الشبابية.epub
Proses: اللآليء المكية من كلام خير البرية.epub
Proses: بداية القاري في ختم صحيح البخاري -.epub
Proses: تيسير اللطيف المنان في خلاصة تفسير القرآن - ط الأوقاف السعودية.epub
Proses: معالم السنة النبوية -.epub
Preprocessing selesai. File disimpan di: output/cleaned_text
