In [17]:
import os
from docling.document_converter import DocumentConverter
from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.embeddings import HuggingFaceEmbeddings

# === CONFIG ===
SOURCE_FOLDER = "/storage/data_4T_b/andreacutuli/PROVA/Documents/pdf_doc-fac-simile"  # cartella con PDF/Word/Doc
OUTPUT_FOLDER = "/storage/data_4T_b/andreacutuli/PROVA/Documents/Markdown_chunks"  # dove salvare i .md

os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# === Inizializza converter e chunker ===
converter = DocumentConverter()
embedding_model = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-large")
semantic_chunker = SemanticChunker(
    embeddings=embedding_model,
    breakpoint_threshold_type="gradient"
)

# === Funzione per processare un singolo documento ===
def process_document(file_path, output_folder):
    # Converti in docling document
    result = converter.convert(file_path)
    doc_markdown = result.document.export_to_markdown()

    # Crea chunk semantici
    semantic_chunks = semantic_chunker.create_documents([doc_markdown])

    # Salva chunk in file markdown
    file_name = os.path.basename(file_path)
    base_name = os.path.splitext(file_name)[0]
    output_file = os.path.join(output_folder, f"{base_name}_chunks.md")

    with open(output_file, "w", encoding="utf-8") as f:
        for i, doc in enumerate(semantic_chunks):
            f.write(f"### Chunk {i + 1}\n\n")
            f.write(doc.page_content.strip() + "\n\n")
    print(f"✅ File salvato: {output_file} ({len(semantic_chunks)} chunk)")

# === Esecuzione per tutti i file della cartella ===
for file_name in os.listdir(SOURCE_FOLDER):
    file_path = os.path.join(SOURCE_FOLDER, file_name)
    if os.path.isfile(file_path) and file_name.lower().endswith(('.pdf', '.docx', '.doc')):
        process_document(file_path, OUTPUT_FOLDER)

print("\n✅ Tutti i documenti processati.")


2025-09-30 19:06:52,184 - INFO - Use pytorch device_name: cuda:0
2025-09-30 19:06:52,185 - INFO - Load pretrained SentenceTransformer: intfloat/multilingual-e5-large


2025-09-30 19:06:58,465 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-09-30 19:06:58,469 - INFO - Going to convert document batch...
2025-09-30 19:06:58,470 - INFO - Initializing pipeline for StandardPdfPipeline with options hash d291d1f79894f05d312cc90dd3fdf3d3
2025-09-30 19:06:58,470 - INFO - Accelerator device: 'cuda:0'
2025-09-30 19:07:02,698 - INFO - Accelerator device: 'cuda:0'
2025-09-30 19:07:05,077 - INFO - Accelerator device: 'cuda:0'
2025-09-30 19:07:06,960 - INFO - Processing document 1 - Copia (2).pdf
2025-09-30 19:07:07,510 - INFO - Finished converting document 1 - Copia (2).pdf in 9.05 sec.
2025-09-30 19:07:07,882 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-09-30 19:07:07,884 - INFO - Going to convert document batch...
2025-09-30 19:07:07,885 - INFO - Processing document 1 - Copia (4).pdf


✅ File salvato: /storage/data_4T_b/andreacutuli/PROVA/Documents/Markdown_chunks/1 - Copia (2)_chunks.md (2 chunk)


2025-09-30 19:07:08,427 - INFO - Finished converting document 1 - Copia (4).pdf in 0.55 sec.
2025-09-30 19:07:08,800 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-09-30 19:07:08,802 - INFO - Going to convert document batch...
2025-09-30 19:07:08,803 - INFO - Processing document 1 - Copia (8).pdf


✅ File salvato: /storage/data_4T_b/andreacutuli/PROVA/Documents/Markdown_chunks/1 - Copia (4)_chunks.md (2 chunk)


2025-09-30 19:07:09,336 - INFO - Finished converting document 1 - Copia (8).pdf in 0.54 sec.
2025-09-30 19:07:09,720 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-09-30 19:07:09,721 - INFO - Going to convert document batch...
2025-09-30 19:07:09,722 - INFO - Processing document 1 - Copia (5).pdf


✅ File salvato: /storage/data_4T_b/andreacutuli/PROVA/Documents/Markdown_chunks/1 - Copia (8)_chunks.md (2 chunk)


2025-09-30 19:07:10,254 - INFO - Finished converting document 1 - Copia (5).pdf in 0.54 sec.
2025-09-30 19:07:10,638 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-09-30 19:07:10,640 - INFO - Going to convert document batch...
2025-09-30 19:07:10,641 - INFO - Processing document 1.pdf


✅ File salvato: /storage/data_4T_b/andreacutuli/PROVA/Documents/Markdown_chunks/1 - Copia (5)_chunks.md (2 chunk)


2025-09-30 19:07:11,196 - INFO - Finished converting document 1.pdf in 0.56 sec.
2025-09-30 19:07:11,396 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-09-30 19:07:11,398 - INFO - Going to convert document batch...
2025-09-30 19:07:11,399 - INFO - Processing document 1 - Copia (7).pdf


✅ File salvato: /storage/data_4T_b/andreacutuli/PROVA/Documents/Markdown_chunks/1_chunks.md (2 chunk)


2025-09-30 19:07:11,939 - INFO - Finished converting document 1 - Copia (7).pdf in 0.54 sec.
2025-09-30 19:07:12,144 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-09-30 19:07:12,145 - INFO - Going to convert document batch...
2025-09-30 19:07:12,146 - INFO - Processing document 1 - Copia (6).pdf


✅ File salvato: /storage/data_4T_b/andreacutuli/PROVA/Documents/Markdown_chunks/1 - Copia (7)_chunks.md (2 chunk)


2025-09-30 19:07:13,504 - INFO - Finished converting document 1 - Copia (6).pdf in 1.36 sec.
2025-09-30 19:07:13,893 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-09-30 19:07:13,894 - INFO - Going to convert document batch...
2025-09-30 19:07:13,895 - INFO - Processing document 1 - Copia (3).pdf


✅ File salvato: /storage/data_4T_b/andreacutuli/PROVA/Documents/Markdown_chunks/1 - Copia (6)_chunks.md (2 chunk)


2025-09-30 19:07:14,409 - INFO - Finished converting document 1 - Copia (3).pdf in 0.52 sec.
2025-09-30 19:07:14,782 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-09-30 19:07:14,784 - INFO - Going to convert document batch...
2025-09-30 19:07:14,785 - INFO - Processing document 1 - Copia (9).pdf


✅ File salvato: /storage/data_4T_b/andreacutuli/PROVA/Documents/Markdown_chunks/1 - Copia (3)_chunks.md (2 chunk)


2025-09-30 19:07:15,327 - INFO - Finished converting document 1 - Copia (9).pdf in 0.55 sec.
2025-09-30 19:07:15,528 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-09-30 19:07:15,530 - INFO - Going to convert document batch...
2025-09-30 19:07:15,531 - INFO - Processing document 1 - Copia.pdf


✅ File salvato: /storage/data_4T_b/andreacutuli/PROVA/Documents/Markdown_chunks/1 - Copia (9)_chunks.md (2 chunk)


2025-09-30 19:07:16,065 - INFO - Finished converting document 1 - Copia.pdf in 0.54 sec.


✅ File salvato: /storage/data_4T_b/andreacutuli/PROVA/Documents/Markdown_chunks/1 - Copia_chunks.md (2 chunk)

✅ Tutti i documenti processati.


In [None]:
import os
import chromadb
import requests
import hashlib

# === CONFIG ===
MD_FOLDER = r"/storage/data_4T_b/andreacutuli/PROVA/Documents/Markdown_chunks"  # cartella con file .md
CHROMA_PATH = "./chroma_test_db1"
EMBED_MODEL = "nomic-embed-text"
LLM_MODEL = "mistral"
OLLAMA_URL = "http://localhost:11434/api"

# === Setup ChromaDB ===
chroma_client = chromadb.PersistentClient(path=CHROMA_PATH)
collection_name = "document_qa_collection"
collection = chroma_client.get_or_create_collection(name=collection_name)

# === FUNZIONI ===
def get_ollama_embedding(text):
    try:
        response = requests.post(
            f"{OLLAMA_URL}/embeddings",
            json={"model": EMBED_MODEL, "prompt": text}
        ).json()
        if "embedding" not in response:
            raise KeyError("Chiave 'embedding' mancante nella risposta di Ollama.")
        return response["embedding"]
    except Exception as e:
        print(f"⚠️ Errore generando embedding: {e}")
        return None

def generate_response(question, relevant_chunks):
    context = "\n\n".join(relevant_chunks)
    prompt = (
        "You are an assistant for question-answering tasks. Use the following pieces of "
        "retrieved context to answer the question. If you don't know the answer, say that you "
        "don't know. Use three sentences maximum and keep it concise.\n\n"
        f"Context:\n{context}\n\nQuestion:\n{question}"
    )
    try:
        response = requests.post(
            f"{OLLAMA_URL}/chat",
            json={
                "model": LLM_MODEL,
                "messages": [
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content": prompt}
                ],
                "stream": False
            }
        )
        response.raise_for_status()
        return response.json()['message']['content']
    except Exception as e:
        return f"⚠️ Errore durante la generazione della risposta: {e}"

def query_documents(question, n_results=3):
    query_embedding = get_ollama_embedding(question)
    if query_embedding is None:
        return []
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=n_results,
        include=['metadatas', 'documents']
    )
    retrieved = []
    if "documents" in results and results["documents"]:
        for doc, meta in zip(results["documents"][0], results["metadatas"][0]):
            retrieved.append({"text": doc, "filename": meta.get('filename', 'Sconosciuto')})
    return retrieved

def index_md_file(md_filepath):
    """Indicizza un singolo file Markdown già diviso in chunk ### Chunk."""
    with open(md_filepath, "r", encoding="utf-8") as f:
        content = f.read()
    
    raw_chunks = [chunk.strip() for chunk in content.split("### Chunk") if chunk.strip()]
    filename = os.path.basename(md_filepath)
    
    for i, chunk_text in enumerate(raw_chunks):
        chunk_id = f"{filename}_chunk{i+1}"
        embedding = get_ollama_embedding(chunk_text)
        if embedding is None:
            print(f"⚠️ Embedding fallito per {chunk_id}")
            continue
        collection.upsert(
            ids=[chunk_id],
            documents=[chunk_text],
            embeddings=[embedding],
            metadatas={"filename": filename}
        )
    print(f"✅ Indicizzazione completata per {filename}")

# === ESECUZIONE PRINCIPALE: indicizza tutti i file della cartella ===
for file_name in os.listdir(MD_FOLDER):
    file_path = os.path.join(MD_FOLDER, file_name)
    if os.path.isfile(file_path) and file_name.lower().endswith(".md"):
        index_md_file(file_path)

# === QUERY E RISPOSTA ===
question = "parlami del riscaldamento globale"
retrieved_results = query_documents(question)

if not retrieved_results:
    print("⚠️ Nessun documento rilevante trovato.")
else:
    relevant_chunks = [res["text"] for res in retrieved_results]
    answer = generate_response(question, relevant_chunks)
    
    print("\n✅ Risposta sintetica:")
    print(answer)
    
    source_files = sorted(list(set(res["filename"] for res in retrieved_results)))
    print("\n📚 Fonti utilizzate:")
    for filename in source_files:
        print(f"- {filename}")


✅ Indicizzazione completata per 1 - Copia (4)_chunks.md
✅ Indicizzazione completata per 1 - Copia_chunks.md
✅ Indicizzazione completata per 1 - Copia (5)_chunks.md
✅ Indicizzazione completata per 1 - Copia (3)_chunks.md
✅ Indicizzazione completata per 1 - Copia (6)_chunks.md
✅ Indicizzazione completata per 1_chunks.md
✅ Indicizzazione completata per 1 - Copia (2)_chunks.md
✅ Indicizzazione completata per 1 - Copia (8)_chunks.md
✅ Indicizzazione completata per 1 - Copia (9)_chunks.md
✅ Indicizzazione completata per 1 - Copia (7)_chunks.md
