# 📘 Extraction du texte des fichiers PDF avec source et nettoyage

In [None]:
# Importer les bibliothèques nécessaires
import fitz  # PyMuPDF
import os
import pandas as pd
from tqdm import tqdm

# 📁 Dossiers
# Input directory that is scanned for PDF files
pdf_folder = "../data/pdf_books"
# Output directory that receives one CSV per processed PDF
output_folder = "../data/extracted_texts"
# Create the output directory up front; exist_ok avoids an error on re-runs
os.makedirs(output_folder, exist_ok=True)

# 📄 Fonction d'extraction
def extract_text_from_pdf(pdf_path, source_name):
    """Extract the text of every non-empty page of a PDF into a DataFrame.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.
        source_name: Label stored in the ``source`` column of every row.

    Returns:
        A ``pandas.DataFrame`` with the columns ``source``, ``page`` (1-based
        page number) and ``text`` (page text with surrounding whitespace
        stripped). Pages whose stripped text is empty are omitted. The three
        columns are present even when no page yields any text, so a CSV
        written from the result still has a header row.
    """
    columns = ["source", "page", "text"]
    text_chunks = []
    # The context manager closes the document (and its file handle) even when
    # loading a page or extracting its text raises.
    with fitz.open(pdf_path) as doc:
        for page_num in range(len(doc)):
            text = doc.load_page(page_num).get_text("text").strip()
            if text:  # Skip pages without extractable text
                text_chunks.append({
                    "source": source_name,
                    "page": page_num + 1,
                    "text": text
                })
    # Explicit columns keep the schema stable when text_chunks is empty
    return pd.DataFrame(text_chunks, columns=columns)

# 🔁 Boucle sur tous les PDF
# Keep only PDF files (extension matched case-insensitively) and sort them so
# that runs are reproducible and the progress bar counts PDFs only.
pdf_files = sorted(
    name for name in os.listdir(pdf_folder) if name.lower().endswith(".pdf")
)
for filename in tqdm(pdf_files):
    full_path = os.path.join(pdf_folder, filename)
    # splitext drops only the final extension; str.replace would also remove
    # ".pdf" occurring elsewhere in the file name
    source_name = os.path.splitext(filename)[0]
    df = extract_text_from_pdf(full_path, source_name)
    # Save the extracted pages as CSV, one file per PDF
    csv_name = source_name + ".csv"
    df.to_csv(os.path.join(output_folder, csv_name), index=False)


100%|██████████| 6/6 [00:03<00:00,  1.98it/s]


# 🧼 Prétraitement avant l’embedding

1. Nettoyage du texte :
    - supprimer les caractères spéciaux, sauts de page, numéros de page, titres répétitifs
    - uniformiser les espaces et ponctuations
    - supprimer les lignes trop courtes ou non informatives (ex. : “Chapitre 1”, “Page 12”)

In [3]:
import pandas as pd
import re

def clean_text(text):
    """Normalise whitespace and remove "Page N" markers from extracted text.

    Args:
        text: Raw page text. Anything that is not a ``str`` (for example the
            NaN that pandas yields for an empty CSV cell) is treated as empty.

    Returns:
        The text on a single line: every run of whitespace (newlines, tabs,
        multiple spaces) is collapsed to one space, occurrences of
        ``Page <digits>`` are removed, and leading/trailing whitespace is
        stripped. Returns ``""`` for non-string input.
    """
    if not isinstance(text, str):
        return ""
    # Collapse first so that "Page\n12" or "Page  12" match the marker pattern
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"Page \d+", "", text)  # drop page-number mentions
    # Removing a marker can leave two adjacent spaces behind; collapse again
    text = re.sub(r" {2,}", " ", text)
    return text.strip()


2. Chunking intelligent :
    - diviser les textes en blocs cohérents (≈ 100–300 mots)
    - utiliser les sauts de paragraphe ou la ponctuation comme repères
    - ajouter des métadonnées : source, numéro de chunk, page d’origine

In [5]:
def chunk_text(text, max_words=200):
    """Split text into chunks made of whole sentences.

    Sentences are delimited by ``.``, ``!`` or ``?`` followed by one or more
    spaces. Sentences are accumulated until the running word count reaches
    ``max_words``; the chunk is then closed and a new one is started. Because
    a sentence is never cut, a chunk may hold more than ``max_words`` words.

    Args:
        text: The text to split.
        max_words: Word count at which the current chunk is closed.

    Returns:
        A list of chunk strings, each the space-joined sentences of the chunk.
        Empty or whitespace-only input yields an empty list, so no empty
        chunk is ever produced.
    """
    sentences = re.split(r'(?<=[.!?]) +', text)
    chunks, current_chunk = [], []
    word_count = 0

    for sentence in sentences:
        n_words = len(sentence.split())
        if n_words == 0:
            # Blank piece (empty input or trailing spaces): nothing to keep
            continue
        word_count += n_words
        current_chunk.append(sentence)
        if word_count >= max_words:
            chunks.append(" ".join(current_chunk))
            current_chunk, word_count = [], 0

    # Flush the sentences left over after the last full chunk
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks



3. Structuration finale - créer un DataFrame avec les colonnes :
    - source
    - page
    - chunk_id
    - text_clean
    - text_chunked

In [None]:
# Build one record per chunk from every extracted CSV and save the result.
# Note: the cleaned, chunked text is stored in a single "text" column.
extracted_dir = "../data/extracted_texts"
embedding_dir = "../data/ready_for_embedding"
# DataFrame.to_csv does not create missing directories
os.makedirs(embedding_dir, exist_ok=True)

all_chunks = []

# Sorted so that the chunk order is reproducible across runs and platforms
for file in sorted(os.listdir(extracted_dir)):
    if not file.endswith(".csv"):
        continue
    try:
        df = pd.read_csv(os.path.join(extracted_dir, file))
    except pd.errors.EmptyDataError:
        # File without any column (e.g. written from an empty DataFrame)
        continue
    source = os.path.splitext(file)[0]
    # Rows whose text was parsed as missing cannot be cleaned or chunked
    for row in df.dropna(subset=["text"]).itertuples(index=False):
        cleaned = clean_text(row.text)
        # Drop blank chunks before numbering so chunk ids stay contiguous
        chunks = [chunk for chunk in chunk_text(cleaned) if chunk.strip()]
        for i, chunk in enumerate(chunks):
            all_chunks.append({
                "source": source,
                "page": row.page,
                "chunk_id": f"{source}_p{row.page}_c{i}",
                "text": chunk
            })

# Explicit columns keep the CSV header stable even when no chunk was produced
df_chunks = pd.DataFrame(all_chunks, columns=["source", "page", "chunk_id", "text"])
df_chunks.to_csv(os.path.join(embedding_dir, "chunks.csv"), index=False)


In [9]:
# Check how many chunks were created
print(f"Nombre total de chunks créés : {len(df_chunks)}")

Nombre total de chunks créés : 3316
