# Pipeline per estrarre tutte le tabelle e le immagini da un pdf e poi tutte le immagini e pdf estratti vengono processati per effettuare ocr tramite ollama modello granite

In [13]:
import logging
import os
import time
from pathlib import Path

from docling_core.types.doc import ImageRefMode, PictureItem, TableItem
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

_log = logging.getLogger(__name__)

IMAGE_RESOLUTION_SCALE = 2.0

def main():
    logging.basicConfig(level=logging.INFO)

    # --- CONFIGURA QUI I PERCORSI (non usare __file__ in notebook/REPL) ---
    # base_dir = Path(__file__).parent                 # se esegui da file .py
    base_dir = Path.cwd()                              # se esegui da notebook/REPL
    pdf_folder = Path("/storage/data_4T_b/andreacutuli/PROVA/Documents/pdf_fac_simile")
    input_doc_path = pdf_folder / "FAC-SIMILE 2.pdf"
    output_dir = Path("/storage/data_4T_b/andreacutuli/PROVA/Documents/output_images")

    # controlli rapidi
    if not input_doc_path.exists():
        raise FileNotFoundError(f"File PDF non trovato: {input_doc_path}")

    # Keep page/element images so they can be exported.
    pipeline_options = PdfPipelineOptions()
    pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
    pipeline_options.generate_page_images = True
    #IO NON VORREI ESTRARRE ANCHE LE PAGINE MA SOLO TABELLE E IMMAGINI
    #MA SE IMPOSTO A FALSE NON MI ESTRAE NIENTE O COMUNQUE NON FUNZIONA BENE
    #QUINDI LASCIO TRUE E POI CANCELLERÒ LE PAGINE DOPO
    pipeline_options.generate_picture_images = True

    doc_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
        }
    )

    start_time = time.time()

    conv_res = doc_converter.convert(str(input_doc_path))

    output_dir.mkdir(parents=True, exist_ok=True)
    doc_filename = conv_res.input.file.stem if getattr(conv_res.input, "file", None) else input_doc_path.stem

    # Save page images
    # conv_res.document.pages potrebbe essere dict-like o list-like: gestiamo entrambi
    pages_obj = getattr(conv_res.document, "pages", None)
    if pages_obj is None:
        _log.warning("Nessuna proprietà pages trovata su conv_res.document")
    else:
        # se pages è dict (chiave = page_no) o lista
        try:
            iterator = list(pages_obj.items())  # dict-like
            is_dict = True
        except Exception:
            iterator = list(enumerate(pages_obj, start=1))  # list-like
            is_dict = False

        for key, page in iterator:
            # page potrebbe avere attributi page_no, image, ecc.
            page_no = getattr(page, "page_no", None) or (key if is_dict else key)
            page_image_filename = output_dir / f"{doc_filename}-page-{page_no}.png"
            try:
                # page.image può essere un oggetto che espone .pil_image o .save
                page_image = getattr(page, "image", None)
                if page_image is None:
                    _log.debug(f"Nessuna immagine pagina per page {page_no}")
                    continue
                pil_img = getattr(page_image, "pil_image", None)
                if pil_img is not None:
                    pil_img.save(page_image_filename, format="PNG")
                else:
                    # prova a chiamare save direttamente
                    page_image.save(str(page_image_filename), format="PNG")
            except Exception as e:
                _log.exception(f"Errore salvataggio immagine pagina {page_no}: {e}")

    # Save images of figures and tables
    table_counter = 0
    picture_counter = 0
    try:
        iterator = conv_res.document.iterate_items()
    except Exception:
        iterator = []

    for element, _level in iterator:
        try:
            if isinstance(element, TableItem):
                table_counter += 1
                element_image_filename = output_dir / f"{doc_filename}-table-{table_counter}.png"
                img = element.get_image(conv_res.document)
                # get_image può restituire PIL Image o altro; proviamo a gestire entrambe
                if hasattr(img, "save"):
                    img.save(element_image_filename, format="PNG")
                else:
                    # se è un byte stream
                    with open(element_image_filename, "wb") as fp:
                        fp.write(img)
            elif isinstance(element, PictureItem):
                picture_counter += 1
                element_image_filename = output_dir / f"{doc_filename}-picture-{picture_counter}.png"
                img = element.get_image(conv_res.document)
                if hasattr(img, "save"):
                    img.save(element_image_filename, format="PNG")
                else:
                    with open(element_image_filename, "wb") as fp:
                        fp.write(img)
        except Exception as e:
            _log.exception(f"Errore salvataggio elemento {type(element)}: {e}")

    # Save markdown with embedded pictures
    try:
        md_filename = output_dir / f"{doc_filename}-with-images.md"
        conv_res.document.save_as_markdown(md_filename, image_mode=ImageRefMode.EMBEDDED)
    except Exception as e:
        _log.exception(f"Errore salvataggio markdown embedded: {e}")

    # Save markdown with externally referenced pictures
    try:
        md_filename2 = output_dir / f"{doc_filename}-with-image-refs.md"
        conv_res.document.save_as_markdown(md_filename2, image_mode=ImageRefMode.REFERENCED)
    except Exception as e:
        _log.exception(f"Errore salvataggio markdown referenced: {e}")

    # Save HTML with externally referenced pictures
    try:
        html_filename = output_dir / f"{doc_filename}-with-image-refs.html"
        conv_res.document.save_as_html(html_filename, image_mode=ImageRefMode.REFERENCED)
    except Exception as e:
        _log.exception(f"Errore salvataggio html referenced: {e}")

    end_time = time.time() - start_time
    _log.info(f"Document converted and figures exported in {end_time:.2f} seconds.")
    _log.info(f"Saved pages: {page_no if 'page_no' in locals() else 'n/a'}, tables: {table_counter}, pictures: {picture_counter}")


    for nome_file in os.listdir(output_dir):
        percorso_file = os.path.join(output_dir, nome_file)
        if os.path.isfile(percorso_file) and "page" in nome_file:
            os.remove(percorso_file)
            print(f"Cancellato: {nome_file}")

if __name__ == "__main__":
    main()






2025-09-30 22:29:02,078 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-09-30 22:29:02,086 - INFO - Going to convert document batch...
2025-09-30 22:29:02,087 - INFO - Initializing pipeline for StandardPdfPipeline with options hash 85ac70aecbb3e9d18a585ddbe6342108
2025-09-30 22:29:02,088 - INFO - Accelerator device: 'cuda:0'


2025-09-30 22:29:06,901 - INFO - Accelerator device: 'cuda:0'
2025-09-30 22:29:08,925 - INFO - Accelerator device: 'cuda:0'
2025-09-30 22:29:09,642 - INFO - Processing document FAC-SIMILE 2.pdf
2025-09-30 22:29:25,057 - INFO - Finished converting document FAC-SIMILE 2.pdf in 22.98 sec.
2025-09-30 22:29:28,319 - INFO - Document converted and figures exported in 26.24 seconds.
2025-09-30 22:29:28,320 - INFO - Saved pages: 16, tables: 4, pictures: 5


Cancellato: FAC-SIMILE 2-page-2.png
Cancellato: FAC-SIMILE 2-page-10.png
Cancellato: FAC-SIMILE 2-page-4.png
Cancellato: FAC-SIMILE 2-page-11.png
Cancellato: FAC-SIMILE 2-page-8.png
Cancellato: FAC-SIMILE 2-page-6.png
Cancellato: FAC-SIMILE 2-page-12.png
Cancellato: FAC-SIMILE 2-page-13.png
Cancellato: FAC-SIMILE 2-page-3.png
Cancellato: FAC-SIMILE 2-page-7.png
Cancellato: FAC-SIMILE 2-page-1.png
Cancellato: FAC-SIMILE 2-page-16.png
Cancellato: FAC-SIMILE 2-page-9.png
Cancellato: FAC-SIMILE 2-page-5.png
Cancellato: FAC-SIMILE 2-page-14.png
Cancellato: FAC-SIMILE 2-page-15.png


In [18]:
import os
import ollama

input_dir = "/storage/data_4T_b/andreacutuli/PROVA/Documents/output_images"
descrizioni_dir = "/storage/data_4T_b/andreacutuli/PROVA/descrizioni"
os.makedirs(descrizioni_dir, exist_ok=True)

# 🔧 Modello Ollama
model = "granite3.2-vision"

# 🧠 Prompt testuale
prompt = "Describe what's in this image."

for nome_file in os.listdir(input_dir):
    percorso_file = os.path.join(input_dir, nome_file)

    # Controlla che sia un file e che sia un'immagine
    if os.path.isfile(percorso_file) and nome_file.lower().endswith((".png", ".jpg", ".jpeg")):
        print(f"\nProcessing image: {nome_file}")

        # 📤 Esegui inferenza con Ollama
        response = ollama.generate(
            model=model,
            prompt=prompt,
            images=[percorso_file]
        )

        descrizione = response['response']
        print("Description:", descrizione)

        # Salva la descrizione in file .txt
        txt_filename = os.path.splitext(nome_file)[0] + ".txt"
        txt_path = os.path.join(descrizioni_dir, txt_filename)
        with open(txt_path, "w", encoding="utf-8") as f:
            f.write(descrizione)
        print(f"[OK] Descrizione salvata in {txt_path}")



Processing image: FAC-SIMILE 2-picture-4.png


2025-09-30 22:41:00,396 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"


Description: 
The image consists of five vertical black bars with white borders.
[OK] Descrizione salvata in /storage/data_4T_b/andreacutuli/PROVA/descrizioni/FAC-SIMILE 2-picture-4.txt

Processing image: FAC-SIMILE 2-picture-3.png


2025-09-30 22:41:01,548 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"


Description: 
The image consists of five vertical black bars against a white background.
[OK] Descrizione salvata in /storage/data_4T_b/andreacutuli/PROVA/descrizioni/FAC-SIMILE 2-picture-3.txt

Processing image: FAC-SIMILE 2-table-1.png


2025-09-30 22:41:39,776 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"


Description: 
The image appears to be a scanned page from a document or book, specifically focusing on a section titled "Sistema di Intelligenza Artificiale per l'Ottimizzazione delle Esperienze di Acquisto in Gestione Inventario." The text is organized into several sections and subsections, each detailing specific aspects of the project. Here's a detailed breakdown:

### Title Section
- **Title:** Sistema di Intelligenza Artificiale per l'Ottimizzazione delle Esperienze di Acquisto in Gestione Inventario
- **Subtitle:** Implementazione di Soluzioni Predictive per Personalizzazione e Gestione del Prodotto

### Main Sections and Subsections
1. **Introduzione**
   - **1.1. Scopo del Documento**
     - Description of the document's purpose.
   - **1.2. Obiettivi del Progetto**
     - Objectives of the project.
   - **1.3. Validità**
     - Validation criteria or methods.

2. **Specifiche di Progetto**
   - **2.1. Descrizione dell'Intervento**
     - Description of the intervention.
   - *

2025-09-30 22:41:40,895 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"


Description: 
The image consists of five vertical black bars against a white background.
[OK] Descrizione salvata in /storage/data_4T_b/andreacutuli/PROVA/descrizioni/FAC-SIMILE 2-picture-2.txt

Processing image: FAC-SIMILE 2-table-2.png


2025-09-30 22:41:57,853 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"


Description: 
The image depicts a table with two columns and four rows. Here is a detailed description of each component:

### Table Description

#### Columns:
1. **Name**: This column lists the names of individuals, presumably professionals or roles within an organization.
2. **Descrizione figura professionale**: This column provides descriptions of their professional figures or roles.

#### Rows:
1. **1 Chief Data Scientist**
   - **Description**: "1 Chief Data Scientist"
2. **1 Senior Data Engineer + PM**
   - **Description**: "1 Senior Data Engineer + PM"
3. **2+ ML Engineers**
   - **Description**: "2+ ML Engineers"

### Markdown Format Representation:
```markdown
| Name                | Descrizione figura professionale   |
|----------------------|-----------------------------------|
| 1 Chief Data Scientist | 1 Chief Data Scientist               |
| 1 Senior Data Engineer + PM | 1 Senior Data Engineer + PM           |
| 2+ ML Engineers       | 2+ ML Engineers                     

2025-09-30 22:42:07,179 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"


Description: 
The image displays a table with two columns labeled "Attività" (Activities) and "Costo [€]" (Cost [€]). The table is structured as follows:

| Attività | Costo [€] |
|----------|-----------|
| 1        | 32,000 €   |
| 2        | 24,000 €   |
| 3        | 22,000 €   |

### Detailed Description:

- **Title**: The table does not have a title. It is simply labeled "Attività" and "Costo [€]".
- **Columns**:
  - **Attività (Activities)**: This column lists the different activities or tasks being considered. In this case, there are three activities: 1, 2, and 3.
  - **Costo [€] (Cost [€])**: This column indicates the cost associated with each activity in euros. The costs for activities 1, 2, and 3 are listed as follows:
    - Activity 1: €32,000
    - Activity 2: €24,000
    - Activity 3: €22,000

### Analysis:

The table provides a clear comparison of the costs associated with three different activities. Here is an analysis based on the data presented:

1. **Activity 1**: The 

2025-09-30 22:42:27,789 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"


Description: 
The image depicts a bar chart with five distinct bars, each representing different categories or groups labeled as "WPs," "Mese 1," "Mese 2," "Mese 3," and "Mese 4." The x-axis is labeled "D1" and the y-axis is labeled "D2_D3." Each bar corresponds to a specific category, with the height of each bar indicating the value or quantity associated with that category.

Here's a detailed breakdown of the image:

### Description of Objects Present in the Image:

1. **Bars**:
   - There are five horizontal bars, each representing different categories.
   - The labels on the y-axis (D2_D3) indicate these categories.
   - Each bar is divided into smaller segments or rectangles within it.

2. **X-Axis Label**:
   - The x-axis is labeled "D1." This label suggests that the data points are grouped by a specific category or variable denoted as D1.

3. **Y-Axis Label**:
   - The y-axis is labeled "D2_D3," indicating that the values represented in each bar correspond to different subcatego

2025-09-30 22:42:28,926 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"


Description: 
The image consists of five vertical black bars against a white background.
[OK] Descrizione salvata in /storage/data_4T_b/andreacutuli/PROVA/descrizioni/FAC-SIMILE 2-picture-5.txt

Processing image: FAC-SIMILE 2-picture-1.png


2025-09-30 22:42:30,056 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"


Description: 
The image consists of five vertical black bars against a white background.
[OK] Descrizione salvata in /storage/data_4T_b/andreacutuli/PROVA/descrizioni/FAC-SIMILE 2-picture-1.txt
