# Pipeline per estrarre tutte le tabelle e le immagini da un PDF e poi eseguire l'OCR di tutte le immagini tramite Ollama (modello Granite).

In [None]:
import logging
import os
import time
from pathlib import Path

from docling_core.types.doc import ImageRefMode, PictureItem, TableItem
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

# Module-level logger used by main() for warnings, errors and timing messages.
_log = logging.getLogger(__name__)

# Value assigned to PdfPipelineOptions.images_scale in main().
# NOTE(review): presumably a multiplier over docling's default render resolution — confirm.
IMAGE_RESOLUTION_SCALE = 2.0

def _build_converter():
    """Return a DocumentConverter whose PDF pipeline renders page and picture images."""
    pipeline_options = PdfPipelineOptions()
    pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
    pipeline_options.generate_page_images = True
    pipeline_options.generate_picture_images = True

    return DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
        }
    )


def _save_page_images(document, output_dir, doc_filename):
    """Write one PNG per page of *document* into *output_dir*; return how many were written."""
    pages_obj = getattr(document, "pages", None)
    if pages_obj is None:
        _log.warning("Nessuna proprietà pages trovata su conv_res.document")
        return 0

    # `pages` may be dict-like (key -> page) or a plain sequence of pages.
    if hasattr(pages_obj, "items"):
        keyed_pages = list(pages_obj.items())
    else:
        keyed_pages = list(enumerate(pages_obj, start=1))

    saved = 0
    for key, page in keyed_pages:
        # Prefer the page's own number; fall back to its key/position.
        page_no = getattr(page, "page_no", None) or key
        page_image_filename = output_dir / f"{doc_filename}-page-{page_no}.png"
        try:
            page_image = getattr(page, "image", None)
            if page_image is None:
                _log.debug(f"Nessuna immagine pagina per page {page_no}")
                continue
            pil_img = getattr(page_image, "pil_image", None)
            if pil_img is not None:
                pil_img.save(page_image_filename, format="PNG")
            else:
                page_image.save(str(page_image_filename), format="PNG")
            saved += 1
        except Exception as e:
            # Best effort: one unreadable page must not stop the export.
            _log.exception(f"Errore salvataggio immagine pagina {page_no}: {e}")
    return saved


def _save_element_image(element, document, image_path):
    """Save the image of a table/picture item to *image_path*; return True if a file was written."""
    img = element.get_image(document)
    if img is None:
        # NOTE(review): get_image() appears to be Optional in docling — confirm.
        # Without this guard an empty 0-byte PNG would be left on disk.
        _log.warning(f"Nessuna immagine disponibile per {image_path.name}")
        return False
    if hasattr(img, "save"):
        img.save(image_path, format="PNG")
    else:
        # Not an image object: assumed to be raw bytes — TODO confirm.
        with open(image_path, "wb") as fp:
            fp.write(img)
    return True


def _export_items(document, output_dir, doc_filename):
    """Export table/picture images and build the placeholder markdown in one pass.

    Tables and pictures are numbered in iteration order; the same number is
    used for the PNG file name and for the ``[[TABLE-n]]`` / ``[[IMAGE-n]]``
    placeholder, so the two always stay aligned even when an image cannot be
    saved.

    Returns:
        (md_lines, tables_saved, pictures_saved)
    """
    md_lines = []
    seen = {"table": 0, "picture": 0}
    saved = {"table": 0, "picture": 0}

    try:
        items = document.iterate_items()
    except Exception as e:
        _log.exception(f"Impossibile iterare gli elementi del documento: {e}")
        items = []

    for element, _level in items:
        if isinstance(element, TableItem):
            kind, tag = "table", "TABLE"
        elif isinstance(element, PictureItem):
            kind, tag = "picture", "IMAGE"
        else:
            # Any other item contributes its text, when it has some.
            try:
                testo = getattr(element, "text", "") or getattr(element, "get_text", lambda: "")()
            except Exception as e:
                _log.exception(f"Errore processando elemento {type(element)}: {e}")
                continue
            if testo:
                md_lines.append(testo + "\n")
            continue

        seen[kind] += 1
        number = seen[kind]
        md_lines.append(f"[[{tag}-{number}]]\n")

        image_path = output_dir / f"{doc_filename}-{kind}-{number}.png"
        try:
            if _save_element_image(element, document, image_path):
                saved[kind] += 1
        except Exception as e:
            _log.exception(f"Errore salvataggio elemento {type(element)}: {e}")

    return md_lines, saved["table"], saved["picture"]


def _remove_page_images(output_dir, doc_filename):
    """Delete the ``<doc>-page-*.png`` files of one document from *output_dir*.

    Only files with exactly that prefix are removed, so table/picture images
    and the markdown survive even when the document name contains "page".
    """
    prefix = f"{doc_filename}-page-"
    for path in output_dir.iterdir():
        if path.is_file() and path.suffix == ".png" and path.name.startswith(prefix):
            path.unlink()
            print(f"Cancellato: {path.name}")


def main():
    """Convert every PDF in the input folder and export its tables, pictures and markdown.

    For each ``*.pdf`` a sub-folder named after the file is created under the
    output root. It receives one PNG per table and per picture plus a
    ``<doc>-with-placeholders.md`` file in which each table/picture is
    replaced by a ``[[TABLE-n]]`` / ``[[IMAGE-n]]`` placeholder. Page images
    are written and then removed again, so they are not left in the folder.
    A PDF that fails to convert is logged and skipped.
    """
    logging.basicConfig(level=logging.INFO)

    # input folder with the PDFs
    pdf_folder = Path("/storage/data_4T_b/andreacutuli/PROVA/Documents/pdf_fac_simile")
    # root folder under which the per-document output folders are created
    output_root = Path("/storage/data_4T_b/andreacutuli/PROVA/Documents/output_images")

    doc_converter = _build_converter()

    start_time_global = time.time()

    # Sorted so that runs process the documents in a reproducible order.
    for input_doc_path in sorted(pdf_folder.glob("*.pdf")):
        if not input_doc_path.exists():
            # glob can still return dangling symlinks
            _log.warning(f"PDF non trovato: {input_doc_path}")
            continue

        doc_filename = input_doc_path.stem
        output_dir = output_root / doc_filename
        output_dir.mkdir(parents=True, exist_ok=True)

        start_time = time.time()

        try:
            conv_res = doc_converter.convert(str(input_doc_path))
        except Exception as e:
            # Keep the batch going: the other steps are best-effort as well.
            _log.exception(f"Conversione fallita per {input_doc_path}: {e}")
            continue

        pages_saved = _save_page_images(conv_res.document, output_dir, doc_filename)
        md_lines, tables_saved, pictures_saved = _export_items(
            conv_res.document, output_dir, doc_filename
        )

        md_placeholder_filename = output_dir / f"{doc_filename}-with-placeholders.md"
        try:
            with open(md_placeholder_filename, "w", encoding="utf-8") as f:
                f.write("\n".join(md_lines))
            print(f"Markdown con segnaposto salvato in: {md_placeholder_filename}")
        except Exception as e:
            _log.exception(f"Errore salvataggio markdown con segnaposto: {e}")

        end_time = time.time() - start_time
        _log.info(f"Document converted and figures exported in {end_time:.2f} seconds.")
        _log.info(f"Saved pages: {pages_saved}, tables: {tables_saved}, pictures: {pictures_saved}")

        _remove_page_images(output_dir, doc_filename)

    total_time = time.time() - start_time_global
    _log.info(f"Tutti i PDF elaborati in {total_time:.2f} secondi.")


# Run the pipeline only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()


In [None]:
# MOVE EVERY MARKDOWN FILE INTO A SEPARATE FOLDER
import os
import shutil

# Tree that is searched for Markdown files
source_dir = "/storage/data_4T_b/andreacutuli/PROVA/Documents/output_images"

# Flat folder that collects them
dest_dir = "/storage/data_4T_b/andreacutuli/PROVA/Documents/markdown_placeholders"
os.makedirs(dest_dir, exist_ok=True)

# Visit source_dir and all of its sub-folders
for root, dirs, files in os.walk(source_dir):
    for file in files:
        if not file.lower().endswith(".md"):
            continue

        source_path = os.path.join(root, file)
        dest_path = os.path.join(dest_dir, file)

        # The destination is a full file path, so the file keeps its name
        shutil.move(source_path, dest_path)
        print(f"Spostato: {source_path} -> {dest_path}")


In [None]:
# DESCRIBE EVERY IMAGE WITH OLLAMA AND SAVE EACH DESCRIPTION TO A .TXT FILE

import os
import ollama

input_root = "/storage/data_4T_b/andreacutuli/PROVA/Documents/output_images"
descrizioni_root = "/storage/data_4T_b/andreacutuli/PROVA/Documents/descrizioni"
os.makedirs(descrizioni_root, exist_ok=True)

# Ollama model and the prompt sent with every image
model = "granite3.2-vision"
prompt = "Describe what's in this image."

# One sub-folder of input_root per document; sorted for a reproducible order
for subfolder in sorted(os.listdir(input_root)):
    subfolder_path = os.path.join(input_root, subfolder)
    if not os.path.isdir(subfolder_path):
        continue  # skip files lying directly in input_root

    # Mirror the sub-folder under descrizioni_root
    output_subfolder = os.path.join(descrizioni_root, subfolder)
    os.makedirs(output_subfolder, exist_ok=True)

    for nome_file in sorted(os.listdir(subfolder_path)):
        percorso_file = os.path.join(subfolder_path, nome_file)

        if not (os.path.isfile(percorso_file) and nome_file.lower().endswith((".png", ".jpg", ".jpeg"))):
            continue

        print(f"\nProcessing image: {nome_file} in folder {subfolder}")

        # Run inference with Ollama. A failure on one image must not discard
        # the rest of the batch, so it is reported and the image is skipped.
        try:
            response = ollama.generate(
                model=model,
                prompt=prompt,
                images=[percorso_file]
            )
        except ollama.ResponseError as e:
            if e.status_code == 404:
                # Model not available: no image can succeed, so stop here.
                raise
            print(f"[ERRORE] Ollama non ha potuto descrivere {percorso_file}: {e}")
            continue

        descrizione = response['response']
        print("Description:", descrizione)

        # Same base name as the image, with a .txt extension
        txt_filename = os.path.splitext(nome_file)[0] + ".txt"
        txt_path = os.path.join(output_subfolder, txt_filename)
        with open(txt_path, "w", encoding="utf-8") as f:
            f.write(descrizione)
        print(f"[OK] Descrizione salvata in {txt_path}")


# Suddivisione dei file markdown in chunk semantici

In [None]:
import os
from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.embeddings import HuggingFaceEmbeddings

# === CONFIG ===
SOURCE_FOLDER = "/storage/data_4T_b/andreacutuli/PROVA/Documents/markdown_placeholders"  # folder the .md files are read from
OUTPUT_FOLDER = "/storage/data_4T_b/andreacutuli/PROVA/Documents/markdown_chunks"  # folder the chunked .md files are written to

os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# === Initialise the chunker ===
# Embedding model handed to the SemanticChunker below.
embedding_model = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-large")
# Shared chunker instance used by process_document().
semantic_chunker = SemanticChunker(
    embeddings=embedding_model,
    breakpoint_threshold_type="gradient"
)

# === Process a single document ===
def process_document(file_path, output_folder):
    """Split one Markdown file into semantic chunks and save them as Markdown.

    The file is read as UTF-8, chunked with the module-level
    ``semantic_chunker`` and written to
    ``<output_folder>/<original stem>_chunks.md``, each chunk preceded by a
    ``### Chunk <n>`` heading (numbering starts at 1).

    Args:
        file_path: path of the Markdown file to read.
        output_folder: existing folder that receives the chunk file.
    """
    # Read the source and close it right away: the input handle is not
    # needed while chunking or while writing the result.
    with open(file_path, "r", encoding="utf-8") as f:
        doc_markdown = f.read()

    # Build the semantic chunks
    semantic_chunks = semantic_chunker.create_documents([doc_markdown])

    # Output name: the input name without its extension, plus "_chunks.md"
    base_name = os.path.splitext(os.path.basename(file_path))[0]
    output_file = os.path.join(output_folder, f"{base_name}_chunks.md")

    with open(output_file, "w", encoding="utf-8") as f:
        for i, doc in enumerate(semantic_chunks, start=1):
            f.write(f"### Chunk {i}\n\n")
            f.write(doc.page_content.strip() + "\n\n")
    print(f"✅ File salvato: {output_file} ({len(semantic_chunks)} chunk)")

# === Run over every Markdown file in the source folder ===
for file_name in os.listdir(SOURCE_FOLDER):
    file_path = os.path.join(SOURCE_FOLDER, file_name)
    # Skip anything that is not a regular .md file
    if not os.path.isfile(file_path):
        continue
    if not file_name.lower().endswith('.md'):
        continue
    process_document(file_path, OUTPUT_FOLDER)

print("\n✅ Tutti i documenti processati.")


✅ File salvato: /storage/data_4T_b/andreacutuli/PROVA/Documents/Markdown_chunks/FAC-SIMILE 2-with-placeholders_chunks.md (13 chunk)
✅ File salvato: /storage/data_4T_b/andreacutuli/PROVA/Documents/Markdown_chunks/FAC-SIMILE 21-with-placeholders_chunks.md (3 chunk)
✅ File salvato: /storage/data_4T_b/andreacutuli/PROVA/Documents/Markdown_chunks/FAC-SIMILE 18-with-placeholders_chunks.md (5 chunk)
✅ File salvato: /storage/data_4T_b/andreacutuli/PROVA/Documents/Markdown_chunks/FAC-SIMILE Validazione-with-placeholders_chunks.md (5 chunk)
✅ File salvato: /storage/data_4T_b/andreacutuli/PROVA/Documents/Markdown_chunks/FAC-SIMILE 14-with-placeholders_chunks.md (8 chunk)
✅ File salvato: /storage/data_4T_b/andreacutuli/PROVA/Documents/Markdown_chunks/FAC-SIMILE 12-with-placeholders_chunks.md (4 chunk)
✅ File salvato: /storage/data_4T_b/andreacutuli/PROVA/Documents/Markdown_chunks/FAC-SIMILE 17-with-placeholders_chunks.md (6 chunk)
✅ File salvato: /storage/data_4T_b/andreacutuli/PROVA/Documents/Mar

# APPLICAZIONE TABELLE E IMMAGINI DENTRO I PLACEHOLDERS

In [None]:
import os
import re

# main folders
template_dir = "/storage/data_4T_b/andreacutuli/PROVA/Documents/markdown_chunks"      # Markdown files containing placeholders
content_dir = "/storage/data_4T_b/andreacutuli/PROVA/Documents/descrizioni"       # folder whose sub-folders hold the table/image descriptions
output_dir = "/storage/data_4T_b/andreacutuli/PROVA/Documents/markdown_chunk_placeholders_table_image" # where the results are written

os.makedirs(output_dir, exist_ok=True)

# regex for placeholders such as [[IMAGE-1]] [[TABLE-3]];
# group 1 is the kind (IMAGE/TABLE), group 2 the number
placeholder_pattern = re.compile(r"\[\[(IMAGE|TABLE)-(\d+)\]\]")

def find_content_file(base_name, kind, num):
    """
    Locate the description file for one placeholder.

    The file searched for is named ``<base_name>-<type>-<num>.txt``, where
    <type> is "picture" for kind "IMAGE" and "table" for kind "TABLE".
    The whole tree under ``content_dir`` is walked; the path of the first
    match is returned, or None when no such file exists.
    """
    type_by_kind = {"IMAGE": "picture", "TABLE": "table"}
    wanted = f"{base_name}-{type_by_kind[kind]}-{num}.txt"

    candidates = (
        os.path.join(folder, wanted)
        for folder, _, names in os.walk(content_dir)
        if wanted in names
    )
    return next(candidates, None)

def _fill_placeholders(text, base_name):
    """Return *text* with every [[IMAGE-n]] / [[TABLE-n]] placeholder expanded.

    Each placeholder whose description file is found becomes

        [[KIND-n]] START
        <file content>
        [[KIND-n]] END

    Placeholders without a description file are reported and left untouched.
    The substitution is a single pass over the original text, so a
    placeholder that occurs several times is wrapped exactly once per
    occurrence and inserted descriptions are never scanned again.
    """
    def expand(match):
        placeholder = match.group(0)
        kind, num = match.groups()
        file_path = find_content_file(base_name, kind, num)
        if not file_path:
            print(f"Manca contenuto per {placeholder} (file base: {base_name})")
            return placeholder

        with open(file_path, "r", encoding="utf-8") as f:
            replacement = f.read()

        # build the complete block
        return f"{placeholder} START\n{replacement}\n{placeholder} END"

    # A callable replacement is inserted literally (no backslash processing).
    return placeholder_pattern.sub(expand, text)


template_suffix = "-with-placeholders_chunks.md"

for filename in os.listdir(template_dir):
    if not filename.endswith(template_suffix):
        continue

    template_path = os.path.join(template_dir, filename)

    # the "base_name" is the file name without the trailing suffix
    base_name = filename[: -len(template_suffix)]

    with open(template_path, "r", encoding="utf-8") as f:
        content = f.read()

    content = _fill_placeholders(content, base_name)

    # save the output
    out_path = os.path.join(output_dir, f"{base_name}.md")
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(content)

    print(f"Creato {out_path}")


Creato /storage/data_4T_b/andreacutuli/PROVA/Documents/markdown_chunk_placeholders_table_image/FAC-SIMILE 6.md
Creato /storage/data_4T_b/andreacutuli/PROVA/Documents/markdown_chunk_placeholders_table_image/FAC-SIMILE 7.md
Creato /storage/data_4T_b/andreacutuli/PROVA/Documents/markdown_chunk_placeholders_table_image/FAC-SIMILE 2.md
Creato /storage/data_4T_b/andreacutuli/PROVA/Documents/markdown_chunk_placeholders_table_image/FAC-SIMILE 9.md
Creato /storage/data_4T_b/andreacutuli/PROVA/Documents/markdown_chunk_placeholders_table_image/FAC-SIMILE 19.md
Creato /storage/data_4T_b/andreacutuli/PROVA/Documents/markdown_chunk_placeholders_table_image/FAC-SIMILE 1.md
Creato /storage/data_4T_b/andreacutuli/PROVA/Documents/markdown_chunk_placeholders_table_image/FAC-SIMILE 14.md
Creato /storage/data_4T_b/andreacutuli/PROVA/Documents/markdown_chunk_placeholders_table_image/FAC-SIMILE 4.md
Creato /storage/data_4T_b/andreacutuli/PROVA/Documents/markdown_chunk_placeholders_table_image/FAC-SIMILE 15.m

# RETRIEVAL

In [None]:
import os
import chromadb
import requests
import hashlib

# === CONFIG ===
MD_FOLDER = "/storage/data_4T_b/andreacutuli/PROVA/Documents/markdown_chunk_placeholders_table_image"  # folder with the .md files to index
# Previous, working-directory-relative location of the database:
#CHROMA_PATH = "./chroma_docs_db"
# The database lives in the user's home directory.
CHROMA_PATH = os.path.expanduser("~/chroma_docs_db")
os.makedirs(CHROMA_PATH, exist_ok=True)
EMBED_MODEL = "nomic-embed-text"  # model name sent to the /embeddings endpoint
LLM_MODEL = "mistral"  # model name sent to the /chat endpoint
OLLAMA_URL = "http://localhost:11434/api"  # base URL of the Ollama HTTP API

# === ChromaDB setup ===
chroma_client = chromadb.PersistentClient(path=CHROMA_PATH)
collection_name = "document_qa_collection"
# Reuses the collection when it already exists, otherwise creates it.
collection = chroma_client.get_or_create_collection(name=collection_name)

# === FUNZIONI ===
def get_ollama_embedding(text):
    """Return the embedding of *text* computed by Ollama, or None on failure.

    Sends *text* to ``{OLLAMA_URL}/embeddings`` using ``EMBED_MODEL``.
    Any failure (connection problem, HTTP error status, malformed or empty
    response) is printed and reported to the caller as None; this function
    never raises.

    Args:
        text: the string to embed.

    Returns:
        The non-empty embedding list from the response, or None.
    """
    try:
        response = requests.post(
            f"{OLLAMA_URL}/embeddings",
            json={"model": EMBED_MODEL, "prompt": text}
        )
        if not response.ok:
            # Report the real cause (status and server message) instead of
            # a misleading "missing key" error further down.
            raise RuntimeError(f"HTTP {response.status_code}: {response.text.strip()}")

        payload = response.json()
        if "embedding" not in payload:
            raise KeyError("Chiave 'embedding' mancante nella risposta di Ollama.")
        if not payload["embedding"]:
            # An empty vector is of no use to callers; treat it as a failure.
            raise ValueError("Ollama ha restituito un embedding vuoto.")
        return payload["embedding"]
    except Exception as e:
        # Deliberately broad: callers rely on the None-on-failure contract.
        print(f"⚠️ Errore generando embedding: {e}")
        return None

def calculate_file_hash(filepath):
    """Return the hexadecimal SHA-256 digest of the file at *filepath*.

    The file is read in binary mode, 4096 bytes at a time.
    """
    digest = hashlib.sha256()
    with open(filepath, "rb") as stream:
        while True:
            block = stream.read(4096)
            if not block:
                break
            digest.update(block)
    return digest.hexdigest()

def generate_response(question, relevant_chunks):
    """Ask the chat model to answer *question* from the retrieved chunks.

    The chunks are joined with blank lines into a context section that is
    sent, together with the question, to ``{OLLAMA_URL}/chat`` using
    ``LLM_MODEL`` (non-streaming).

    Returns:
        The content of the model's message, or a string starting with
        "⚠️ Errore" that describes the failure. Never raises for request or
        response errors.
    """
    context = "\n\n".join(relevant_chunks)
    instructions = (
        "You are an assistant for question-answering tasks. Use the following pieces of "
        "retrieved context to answer the question. If you don't know the answer, say that you "
        "don't know. Use three sentences maximum and keep it concise.\n\n"
    )
    prompt = instructions + f"Context:\n{context}\n\nQuestion:\n{question}"

    try:
        chat_messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ]
        request_body = {"model": LLM_MODEL, "messages": chat_messages, "stream": False}
        response = requests.post(f"{OLLAMA_URL}/chat", json=request_body)
        response.raise_for_status()
        reply = response.json()
        return reply['message']['content']
    except Exception as e:
        return f"⚠️ Errore durante la generazione della risposta: {e}"

def query_documents(question, n_results=3):
    """Return the stored chunks closest to *question*.

    The question is embedded with get_ollama_embedding() and used to query
    the collection for up to *n_results* entries.

    Returns:
        A list of ``{"text": ..., "filename": ...}`` dicts, where filename
        falls back to 'Sconosciuto' when the metadata has none. The list is
        empty when the embedding failed or the query returned no documents.
    """
    query_embedding = get_ollama_embedding(question)
    if query_embedding is None:
        return []

    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=n_results,
        include=['metadatas', 'documents']
    )
    if "documents" not in results or not results["documents"]:
        return []

    # A single query embedding was sent, so only the first result row is used.
    texts = results["documents"][0]
    metas = results["metadatas"][0]
    return [
        {"text": doc, "filename": meta.get('filename', 'Sconosciuto')}
        for doc, meta in zip(texts, metas)
    ]

def index_md_file(md_filepath):
    """Index one Markdown file that is already divided by ``### Chunk`` headings.

    Each chunk is embedded with get_ollama_embedding() and stored in the
    collection under the id ``<filename>_chunk<n>`` together with the file
    name and the SHA-256 hash of the file.

    Re-indexing is avoided when nothing changed:
      * same hash and every chunk already stored -> the file is skipped;
      * same hash but some chunks missing (their embedding failed in an
        earlier run) -> only the missing chunks are indexed;
      * different hash -> all stored chunks of the file are deleted and the
        file is indexed again from scratch.

    Args:
        md_filepath: path of the Markdown file to index.
    """
    with open(md_filepath, "r", encoding="utf-8") as f:
        content = f.read()

    file_hash = calculate_file_hash(md_filepath)
    filename = os.path.basename(md_filepath)

    # NOTE: splitting on the literal "### Chunk" leaves the chunk number
    # that followed the heading at the start of each chunk text.
    raw_chunks = [chunk.strip() for chunk in content.split("### Chunk") if chunk.strip()]
    chunks_by_id = {
        f"{filename}_chunk{i}": chunk_text
        for i, chunk_text in enumerate(raw_chunks, start=1)
    }

    # Check what is already stored for this file
    existing_docs = collection.get(where={"filename": filename}, include=["metadatas"]) or {}
    existing_ids = existing_docs.get("ids") or []
    existing_metas = existing_docs.get("metadatas") or []

    already_indexed = set()
    if existing_metas:
        stored_hashes = {meta.get("file_hash") for meta in existing_metas if meta and "file_hash" in meta}
        if file_hash in stored_hashes:
            already_indexed = {
                doc_id
                for doc_id, meta in zip(existing_ids, existing_metas)
                if meta and meta.get("file_hash") == file_hash
            }
            missing = [doc_id for doc_id in chunks_by_id if doc_id not in already_indexed]
            if not missing:
                print(f"✔️ '{filename}' non è cambiato. Saltato.")
                return
            # Earlier run stored only part of the file: finish the job.
            print(f"🔁 '{filename}': {len(missing)} chunk mancanti. Completamento indicizzazione...")
        else:
            # The file changed: drop its old chunks before re-indexing
            collection.delete(where={"filename": filename})
            print(f"🔄 '{filename}' modificato. Re-indicizzazione in corso...")

    failed = 0
    for chunk_id, chunk_text in chunks_by_id.items():
        if chunk_id in already_indexed:
            continue
        embedding = get_ollama_embedding(chunk_text)
        if embedding is None:
            print(f"⚠️ Embedding fallito per {chunk_id}")
            failed += 1
            continue
        collection.upsert(
            ids=[chunk_id],
            documents=[chunk_text],
            embeddings=[embedding],
            metadatas=[{"filename": filename, "file_hash": file_hash}]
        )

    if failed:
        print(f"⚠️ Indicizzazione incompleta per {filename}: {failed} chunk non indicizzati (verranno ritentati alla prossima esecuzione).")
    else:
        print(f"✅ Indicizzazione completata per {filename}")

# === MAIN RUN: index every Markdown file in the folder ===
for file_name in os.listdir(MD_FOLDER):
    file_path = os.path.join(MD_FOLDER, file_name)
    # Only regular files with a .md extension are indexed
    if not os.path.isfile(file_path):
        continue
    if not file_name.lower().endswith(".md"):
        continue
    index_md_file(file_path)

# === QUERY AND ANSWER ===
question = "pRischi, Guadagni Attesi e Capitale Minimo di Investimento nei vari profili di investimento"
retrieved_results = query_documents(question)

if retrieved_results:
    # Answer from the text of the retrieved chunks
    relevant_chunks = [res["text"] for res in retrieved_results]
    answer = generate_response(question, relevant_chunks)

    print("\n✅ Risposta sintetica:")
    print(answer)

    # Each source file is listed once, in alphabetical order
    source_files = sorted({res["filename"] for res in retrieved_results})
    print("\n📚 Fonti utilizzate:")
    for filename in source_files:
        print(f"- {filename}")
else:
    print("⚠️ Nessun documento rilevante trovato.")


✅ Indicizzazione completata per FAC-SIMILE 12.md
✅ Indicizzazione completata per FAC-SIMILE 4.md
✅ Indicizzazione completata per FAC-SIMILE 18.md
✅ Indicizzazione completata per FAC-SIMILE 21.md
✅ Indicizzazione completata per FAC-SIMILE 15.md
✅ Indicizzazione completata per FAC-SIMILE 20.md
