In [1]:
import fitz  # PyMuPDF
from PIL import Image
import os
import logging

In [2]:
# Logging setup: emit INFO and above, each record prefixed with its timestamp and level
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

In [3]:
# Directory layout: the "data" folder lives in the parent of the current working directory
BASE_DIR = os.getcwd()
DIR_PAI = os.path.split(BASE_DIR)[0]  # parent directory ("pai" = parent)
DIR_DATA = os.path.join(DIR_PAI, "data")
DIR_DATA_RAW = os.path.join(DIR_PAI, "data", "raw")  # input PDFs are read from here
DIR_PDF_TO_IMAGE = os.path.join(DIR_PAI, "data", "processed_pdf_to_images")  # generated images go here
# Make sure the output root exists before any conversion runs
os.makedirs(DIR_PDF_TO_IMAGE, exist_ok=True)

In [4]:
def pdf_to_image(pdf_path, output_dir):
    """Render every page of a PDF to a JPEG file inside ``output_dir``.

    Each page is rasterised with PyMuPDF's ``get_pixmap()`` (default
    settings) and saved as ``<pdf name>_pag<page number>.jpg``, where the
    page number starts at 1.

    Args:
        pdf_path: Path of the PDF file to convert.
        output_dir: Directory that receives the generated images. It is not
            created here; the caller is expected to provide an existing one.

    Returns:
        None. Progress and failures are reported through ``logging``.

    Errors never propagate to the caller: a failure on a single page is
    logged and the remaining pages are still processed; a failure while
    opening or reading the document is logged and the function returns.
    """
    try:
        logging.info(f"Iniciando processamento do PDF: {pdf_path}")
        # Open the document as a context manager so the handle is released
        # even when an error occurs outside the per-page try below (an
        # unclosed handle keeps the file locked on Windows).
        with fitz.open(pdf_path) as pdf_document:
            pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]  # PDF name without extension
            num_pages = len(pdf_document)

            for page_num in range(num_pages):
                # Pages are loaded by index inside the try so that a page
                # that fails to load or render does not stop the others.
                try:
                    page = pdf_document[page_num]
                    # Default get_pixmap() output is an RGB pixmap without
                    # alpha, which is what the "RGB" mode below expects.
                    pix = page.get_pixmap()

                    # Build a PIL image from the raw samples and save it as .jpg
                    img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                    output_file = os.path.join(output_dir, f"{pdf_name}_pag{page_num + 1}.jpg")
                    img.save(output_file, "JPEG")

                    logging.info(f"Página {page_num + 1}/{num_pages} salva em {output_file}")
                except Exception as page_error:
                    logging.error(f"Erro ao processar a página {page_num + 1}: {page_error}")

        logging.info(f"Processamento concluído para o PDF: {pdf_path}")
    except Exception as pdf_error:
        logging.error(f"Erro ao abrir ou processar o PDF {pdf_path}: {pdf_error}")

In [5]:
if __name__ == "__main__":
    # Convert every PDF found in the raw-data folder; each PDF gets its own
    # output sub-folder named after the file (without the extension).
    if not os.path.isdir(DIR_DATA_RAW):
        # Report a missing input folder in the log instead of aborting the
        # cell with an unhandled FileNotFoundError raised by os.listdir.
        logging.error(f"Diretório de entrada não encontrado: {DIR_DATA_RAW}")
    else:
        # os.listdir returns names in arbitrary order; sorting makes the
        # processing order (and therefore the log) reproducible between runs.
        for pdf_file in sorted(os.listdir(DIR_DATA_RAW)):
            if not pdf_file.lower().endswith(".pdf"):
                continue  # only PDFs are processed

            pdf_path = os.path.join(DIR_DATA_RAW, pdf_file)
            pdf_output_dir = os.path.join(DIR_PDF_TO_IMAGE, os.path.splitext(pdf_file)[0])
            os.makedirs(pdf_output_dir, exist_ok=True)  # dedicated folder for this PDF

            # Keep going with the remaining files if one of them fails.
            try:
                pdf_to_image(pdf_path, pdf_output_dir)
            except Exception as e:
                logging.error(f"Erro ao processar o arquivo {pdf_file}: {e}")

2024-12-09 09:14:33,167 - INFO - Iniciando processamento do PDF: c:\Users\axel.chepanski\doutor-ia\1 - extract-pdfs-transformer\data\raw\1 - Engine Control-Fuel System.pdf
2024-12-09 09:14:33,193 - INFO - Página 1/160 salva em c:\Users\axel.chepanski\doutor-ia\1 - extract-pdfs-transformer\data\processed_pdf_to_images\1 - Engine Control-Fuel System\1 - Engine Control-Fuel System_pag1.jpg
2024-12-09 09:14:33,200 - INFO - Página 2/160 salva em c:\Users\axel.chepanski\doutor-ia\1 - extract-pdfs-transformer\data\processed_pdf_to_images\1 - Engine Control-Fuel System\1 - Engine Control-Fuel System_pag2.jpg
2024-12-09 09:14:33,208 - INFO - Página 3/160 salva em c:\Users\axel.chepanski\doutor-ia\1 - extract-pdfs-transformer\data\processed_pdf_to_images\1 - Engine Control-Fuel System\1 - Engine Control-Fuel System_pag3.jpg
2024-12-09 09:14:33,216 - INFO - Página 4/160 salva em c:\Users\axel.chepanski\doutor-ia\1 - extract-pdfs-transformer\data\processed_pdf_to_images\1 - Engine Control-Fuel Sys