# <h1 align="center"><font color="green">Usando Docling para extrair dados de um PDF</font></h1>

<font color="pink">Senior Data Scientist.: Dr. Eddy Giusepe Chirinos Isidro</font>

## <font color="red">Usando ``RapidOcrOptions``</font>

In [4]:
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions, RapidOcrOptions
from docling_core.types.doc import ImageRefMode


# OCR engine settings (RapidOCR), built on their own so the pipeline
# configuration below stays easy to scan.
rapid_ocr_options = RapidOcrOptions(
    lang=['pt', 'en'],           # languages used for OCR
    text_score=0.70,             # confidence level of 0.70 (70%) for extracting text
    print_verbose=True,          # print OCR information
    force_full_page_ocr=False,   # selective: OCR only on images
    bitmap_area_threshold=0.02,  # 0.02 = 2% of the page area
)

# PDF pipeline configuration.
pipeline_options = PdfPipelineOptions(
    # OCR
    do_ocr=True,                     # enable OCR on images
    ocr_options=rapid_ocr_options,
    # Tables
    do_table_structure=True,         # detect table structure
    # Pictures
    generate_picture_images=True,    # extract images from the PDF
    images_scale=0.5,                # 50% of the image scale
    do_picture_classification=True,  # classify images
    do_picture_description=False,    # disabled (avoids extra text coming from images)
    # Enrichment
    do_code_enrichment=True,         # extract code
    do_formula_enrichment=True,      # extract mathematical formulas
)


In [5]:
# Build a PDF-only converter that uses the OCR-enabled pipeline options.
pdf_format_option = PdfFormatOption(pipeline_options=pipeline_options)
format_options = {InputFormat.PDF: pdf_format_option}
converter = DocumentConverter(
    allowed_formats=[InputFormat.PDF],
    format_options=format_options,
)

# Convert the PDF.
source_pdf = "/home/eddygiusepe/2_GitHub/Building_Knowledge_Extraction_Pipeline_with_Docling/data/Data_Science_Eddy_pt.pdf"
result = converter.convert(source_pdf)

2025-11-15 13:14:29,661 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]


2025-11-15 13:14:29,668 - INFO - Going to convert document batch...
2025-11-15 13:14:29,670 - INFO - Initializing pipeline for StandardPdfPipeline with options hash 9a82d51d92de0cb48082bfa594e46939
2025-11-15 13:14:29,671 - INFO - Accelerator device: 'cpu'
2025-11-15 13:14:29,930 - INFO - Accelerator device: 'cpu'
[32m[INFO] 2025-11-15 13:14:29,941 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-11-15 13:14:29,946 [RapidOCR] download_file.py:60: File exists and is valid: /home/eddygiusepe/2_GitHub/Building_Knowledge_Extraction_Pipeline_with_Docling/.venv/lib/python3.13/site-packages/rapidocr/models/ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2025-11-15 13:14:29,948 [RapidOCR] main.py:53: Using /home/eddygiusepe/2_GitHub/Building_Knowledge_Extraction_Pipeline_with_Docling/.venv/lib/python3.13/site-packages/rapidocr/models/ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2025-11-15 13:14:30,023 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 202

In [6]:
# Markdown export settings, gathered in one place.
# `image_placeholder` and `indent` are not passed (they were commented out),
# so the library defaults apply for both.
markdown_export_kwargs = dict(
    escape_html=True,
    escape_underscores=True,
    enable_chart_tables=True,
    image_mode=ImageRefMode.PLACEHOLDER,
    include_annotations=True,
    mark_annotations=False,
    page_break_placeholder=None,
)
markdown_output = result.document.export_to_markdown(**markdown_export_kwargs)

# Save the Markdown to a file.
output_path = "/home/eddygiusepe/2_GitHub/Building_Knowledge_Extraction_Pipeline_with_Docling/data/eddy_data_science_pt.md"
with open(output_path, "w", encoding="utf-8") as md_file:
    md_file.write(markdown_output)
    

## <font color="red">Usando ``EasyOcrOptions``</font>

In [None]:
from docling.datamodel.pipeline_options import EasyOcrOptions

# OCR engine settings (EasyOCR instead of RapidOCR).
# The `text_score` and `print_verbose` settings were commented out in this
# section and are therefore not passed.
easy_ocr_options = EasyOcrOptions(
    lang=['pt', 'en'],           # languages used for OCR
    use_gpu=False,
    force_full_page_ocr=False,   # selective: OCR only on images
    bitmap_area_threshold=0.02,  # 0.02 = 2% of the page area
)

# PDF pipeline configuration.
pipeline_options = PdfPipelineOptions(
    # OCR
    do_ocr=True,                     # enable OCR on images
    ocr_options=easy_ocr_options,
    # Tables
    do_table_structure=True,         # detect table structure
    # Pictures
    generate_picture_images=True,    # extract images from the PDF
    images_scale=0.5,                # 50% of the image scale
    do_picture_classification=True,  # classify images
    do_picture_description=True,     # generate image descriptions
    # Enrichment
    do_code_enrichment=True,         # extract code
    do_formula_enrichment=True,      # extract mathematical formulas
)

# Build a PDF-only converter that uses the OCR-enabled pipeline options.
pdf_format_option = PdfFormatOption(pipeline_options=pipeline_options)
format_options = {InputFormat.PDF: pdf_format_option}
converter = DocumentConverter(
    allowed_formats=[InputFormat.PDF],
    format_options=format_options,
)

# Convert the PDF.
source_pdf = "/home/eddygiusepe/2_GitHub/Building_Knowledge_Extraction_Pipeline_with_Docling/data/Data_Science_Eddy_pt.pdf"
result = converter.convert(source_pdf)

# Markdown export settings, gathered in one place.
# `image_placeholder` is not passed (it was commented out), so the library
# default applies.
markdown_export_kwargs = dict(
    escape_html=True,
    escape_underscores=True,
    indent=4,
    enable_chart_tables=True,
    image_mode=ImageRefMode.PLACEHOLDER,
    include_annotations=True,
    mark_annotations=False,
    page_break_placeholder=None,
)
markdown_output = result.document.export_to_markdown(**markdown_export_kwargs)

# Save the Markdown to a file.
# NOTE(review): this is the same output file written in the RapidOcrOptions
# section, so running both sections overwrites the earlier result.
output_path = "/home/eddygiusepe/2_GitHub/Building_Knowledge_Extraction_Pipeline_with_Docling/data/eddy_data_science_pt.md"
with open(output_path, "w", encoding="utf-8") as md_file:
    md_file.write(markdown_output)

## <font color="red">Mais exemplos de uso</font>

In [None]:
from docling.document_converter import DocumentConverter
from utils.sitemap import get_sitemap_urls

# Converter created with no arguments, i.e. the library's default settings.
converter = DocumentConverter()

# --------------------------------------------------------------
# Basic PDF extraction
# --------------------------------------------------------------

source_pdf = "/home/eddygiusepe/2_GitHub/Building_Knowledge_Extraction_Pipeline_with_Docling/data/Data_Science_Eddy_pt.pdf"
result = converter.convert(source_pdf)

# Keep the converted document around: later cells export it again.
document = result.document
markdown_output = document.export_to_markdown()

# Save as Markdown.
output_path = "/home/eddygiusepe/2_GitHub/Building_Knowledge_Extraction_Pipeline_with_Docling/data/eddy_data_science_pt.md"
with open(output_path, "w", encoding="utf-8") as md_file:
    md_file.write(markdown_output)


In [None]:
import json

# Export the converted document to a dictionary so it can be serialized as JSON.
# Relies on `document` having been produced by an earlier cell.
json_output = document.export_to_dict()

# Save as JSON.
output_path = "/home/eddygiusepe/2_GitHub/Building_Knowledge_Extraction_Pipeline_with_Docling/data/eddy_data_science_pt.json"
with open(output_path, "w", encoding="utf-8") as f:
    # ensure_ascii=False writes accented characters as real UTF-8 text instead
    # of \uXXXX escapes; the file is already opened with encoding="utf-8".
    json.dump(json_output, f, indent=4, ensure_ascii=False)


In [None]:
# --------------------------------------------------------------
# Basic HTML extraction
# --------------------------------------------------------------

# Convert a web page directly from its URL, reusing the existing converter.
page_url = "https://github.com/docling-project/docling"
result = converter.convert(page_url)

document = result.document
markdown_output = document.export_to_markdown()

# print() is used so the Markdown shows with real line breaks.
print(markdown_output)

# --------------------------------------------------------------
# Scrape multiple pages using the sitemap
# --------------------------------------------------------------

# NOTE(review): get_sitemap_urls comes from the project-local utils.sitemap
# module; how it resolves the sitemap for this URL is not visible here - confirm.
sitemap_urls = get_sitemap_urls("https://github.com/docling-project/docling")
conv_results_iter = converter.convert_all(sitemap_urls)

# Collect every conversion that produced a document. A comprehension keeps the
# loop variable local, so the notebook-level `result` and `document` from the
# earlier cells are not silently rebound to the last scraped page.
docs = [
    conv_result.document
    for conv_result in conv_results_iter
    if conv_result.document
]

print(docs)