In [5]:
import os
import csv
import re
import pdfplumber
import pytesseract
from PIL import Image
import fitz  # PyMuPDF
import spacy
from nltk.corpus import stopwords
import nltk
from langdetect import detect

In [6]:
# Télécharger stopwords si première fois
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Charger modèles spaCy FR et EN
nlp_fr = spacy.load("fr_core_news_sm")
nlp_en = spacy.load("en_core_web_sm")

# Stopwords FR + EN
stop_words = set(stopwords.words("french") + stopwords.words("english"))

#  Extraction texte PDF textuel
def extract_text_pdf(pdf_path):
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text += page.extract_text() or ""
    return text

# Extraction OCR PDF scanné (FR+EN)
def extract_text_ocr(pdf_path):
    text = ""
    doc = fitz.open(pdf_path)
    for page_num in range(len(doc)):
        page = doc[page_num]
        pix = page.get_pixmap(dpi=300)
        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
        text += pytesseract.image_to_string(img, lang="fra+eng")
    return text

# Nettoyage + Lemmatisation bilingue
def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zàâçéèêëîïôûùüÿñæœ\s]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()

    try:
        lang = detect(text)
    except:
        lang = "fr"  # fallback
    if lang == "fr":
        doc = nlp_fr(text)
    else:
        doc = nlp_en(text)

    tokens = [token.lemma_ for token in doc if token.text not in stop_words and len(token.text) > 2]
    return " ".join(tokens)

#  Pipeline complet
def preprocess_document(file_path, force_ocr=False):
    text = ""
    if file_path.lower().endswith(".pdf") and not force_ocr:
        text = extract_text_pdf(file_path)
    if not text.strip():
        text = extract_text_ocr(file_path)
    return clean_text(text)

# Génération CSV
def build_csv(dataset_dir, output_csv="dataset.csv"):
    rows = []
    for label in os.listdir(dataset_dir):
        class_dir = os.path.join(dataset_dir, label)
        if not os.path.isdir(class_dir):
            continue
        for fichier in os.listdir(class_dir):
            if fichier.lower().endswith(".pdf"):
                chemin = os.path.join(class_dir, fichier)
                try:
                    texte = preprocess_document(chemin)
                    rows.append([texte, label])
                    print(f"[OK] {fichier} → {label}")
                except Exception as e:
                    print(f"[ERREUR] {fichier} : {e}")

    with open(output_csv, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["texte", "label"])
        writer.writerows(rows)

    print(f"\n✅ Dataset sauvegardé dans {output_csv} ({len(rows)} documents)")

# Exemple d’utilisation
if __name__ == "__main__":
   build_csv("data1", "documents_pretraites.csv")

[ERREUR] facture_16_niveaux_16_niveaux_copy1.pdf : tesseract is not installed or it's not in your PATH. See README file for more information.
[ERREUR] facture_16_niveaux_16_niveaux_copy2.pdf : tesseract is not installed or it's not in your PATH. See README file for more information.
[ERREUR] facture_16_niveaux_2_niveaux_copy1.pdf : tesseract is not installed or it's not in your PATH. See README file for more information.
[ERREUR] facture_16_niveaux_2_niveaux_copy2.pdf : tesseract is not installed or it's not in your PATH. See README file for more information.
[ERREUR] facture_16_niveaux_4_niveaux_copy1.pdf : tesseract is not installed or it's not in your PATH. See README file for more information.
[ERREUR] facture_16_niveaux_4_niveaux_copy2.pdf : tesseract is not installed or it's not in your PATH. See README file for more information.
[ERREUR] facture_16_niveaux_8_niveaux_copy1.pdf : tesseract is not installed or it's not in your PATH. See README file for more information.
[ERREUR] fa