In [None]:
# %pip install pdfplumber

In [1]:
import pdfplumber
import re
import os

# 📂 Ton fichier PDF
pdf_path = "book/le_proces_kafka.pdf"  # ← remplace par le chemin réel
output_dir = "mp3"
os.makedirs(output_dir, exist_ok=True)

# 📤 Extraire le texte du PDF
def extract_clean_text(pdf_path, start_page=1, end_page=None):
    """
    Extrait le texte du PDF entre les pages spécifiées (inclusivement).
    Les numéros de page commencent à 1 (comme pour un lecteur PDF).

    :param pdf_path: Chemin vers le fichier PDF
    :param start_page: Page de début (1-indexée)
    :param end_page: Page de fin (1-indexée, inclus)
    :return: Texte extrait et nettoyé
    """
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        num_pages = len(pdf.pages)
        
        # Ajuste les bornes si non spécifiées ou dépassées
        start_idx = max(0, start_page - 1)
        end_idx = min(end_page if end_page else num_pages, num_pages)

        for i in range(start_idx, end_idx):
            page = pdf.pages[i]
            page_text = page.extract_text()
            if page_text:
                text += page_text + "\n\n"  # Double saut de ligne = délimite paragraphes

    return text


# 🧹 Nettoyer les caractères invisibles et HTML potentiels
def clean_text(text):
    text = re.sub(r'<[^>]+>', '', text)  # supprime tags HTML/XML
    text = re.sub(r'[\u200b\u200c\u200d\u2028\u2029]', '', text)  # supprime unicode invisibles
    text = ''.join(c for c in text if c.isprintable())  # filtre caractères non imprimables
    return text.strip()

# 📄 Traitement
raw_text = extract_clean_text(pdf_path, start_page=1, end_page=316)#221, end_page=244)
page_content = clean_text(raw_text)

# 🗣️ Synthèse vocale en morceaux pour éviter les surcharges
from pathlib import Path
import subprocess
import shutil

temp_dir = os.path.join(output_dir, "chunks")
os.makedirs(temp_dir, exist_ok=True)

# 🧠 Découpe en paragraphes ou chunks (CHUNKS pour le rate qui est ignorer quand les chunks ou paragraphes sont trop courts)
# chunk_size = 5000
# chunks = [page_content[i:i + chunk_size] for i in range(0, len(page_content), chunk_size)]

paragraphs = [p.strip() for p in page_content.split("\n\n") if p.strip()]
mp3_files = []

for i, paragraph in enumerate(paragraphs):
    m4a_path = os.path.join(temp_dir, f"chunk_{i}.m4a")
    mp3_path = os.path.join(temp_dir, f"chunk_{i}.mp3")

    print(f"[{i+1}/{len(paragraphs)}] Synthèse en cours...")

    try:
        subprocess.run([
            "say", "-v", "Amélie", "-r", "150",
            "-o", m4a_path,
            "--file-format=m4af",
            paragraph
        ], check=True)

        subprocess.run([
            "ffmpeg", "-y", "-i", m4a_path,
            "-q:a", "0", "-map", "a", mp3_path
        ], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

        mp3_files.append(mp3_path)
    except Exception as e:
        print(f"Erreur lors du traitement du paragraphe {i}: {e}")

# 🔗 Concaténation en un seul MP3
concat_list = os.path.join(temp_dir, "concat_list.txt")
# with open(concat_list, "w", encoding="utf-8") as f:
#     for mp3 in mp3_files:
#         f.write(f"file '{mp3}'\n")
with open(concat_list, "w", encoding="utf-8") as f:
    for mp3 in mp3_files:
        rel_path = os.path.relpath(mp3, start=os.path.dirname(concat_list))
        f.write(f"file '{rel_path}'\n")

final_output = os.path.join(output_dir, "le_proces_kafka.mp3")
subprocess.run([
    "ffmpeg", "-f", "concat", "-safe", "0", "-i", concat_list,
    "-c", "copy", final_output
], check=True)

print(f"✅ Livre audio généré : {final_output}")

# 🧹 Nettoyage
shutil.rmtree(temp_dir)


[1/1] Synthèse en cours...


ffmpeg version 7.1.1 Copyright (c) 2000-2025 the FFmpeg developers
  built with Apple clang version 17.0.0 (clang-1700.0.13.3)
  configuration: --prefix=/opt/homebrew/Cellar/ffmpeg/7.1.1_3 --enable-shared --enable-pthreads --enable-version3 --cc=clang --host-cflags= --host-ldflags='-Wl,-ld_classic' --enable-ffplay --enable-gnutls --enable-gpl --enable-libaom --enable-libaribb24 --enable-libbluray --enable-libdav1d --enable-libharfbuzz --enable-libjxl --enable-libmp3lame --enable-libopus --enable-librav1e --enable-librist --enable-librubberband --enable-libsnappy --enable-libsrt --enable-libssh --enable-libsvtav1 --enable-libtesseract --enable-libtheora --enable-libvidstab --enable-libvmaf --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxml2 --enable-libxvid --enable-lzma --enable-libfontconfig --enable-libfreetype --enable-frei0r --enable-libass --enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenjpeg --enable-libspeex

✅ Livre audio généré : mp3/le_proces_kafka.mp3


[out#0/mp3 @ 0x15763a2c0] video:0KiB audio:225493KiB subtitle:0KiB other streams:0KiB global headers:0KiB muxing overhead: 0.000132%
size=  225493KiB time=05:57:24.98 bitrate=  86.1kbits/s speed=9.59e+03x    
