In [None]:
import os
from docling.document_converter import DocumentConverter
from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.embeddings import HuggingFaceEmbeddings

# === CONFIG ===
# Input and output locations for the document-to-Markdown conversion step.
SOURCE_FOLDER = "/storage/data_4T_b/andreacutuli/PROVA/Documents/pdf_doc-fac-simile"  # folder containing the PDF/Word/Doc files
OUTPUT_FOLDER = "/storage/data_4T_b/andreacutuli/PROVA/Documents/Markdown_chunks"  # where the .md files are saved

# Create the output folder up front; exist_ok=True makes re-running the cell harmless.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# === Initialise the converter and the chunker ===
# Docling converter; process_document() uses it to export each source file to Markdown.
converter = DocumentConverter()
# Embedding model handed to the semantic chunker below.
embedding_model = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-large")
# Chunker configured with the embeddings above and the "gradient" breakpoint threshold type.
semantic_chunker = SemanticChunker(
    embeddings=embedding_model,
    breakpoint_threshold_type="gradient"
)

# === Convert a single document and save its semantic chunks ===
def process_document(file_path, output_folder):
    """Convert one document to Markdown, chunk it, and write the chunks to disk.

    The document is converted with the module-level ``converter``, exported to
    Markdown, and split with the module-level ``semantic_chunker``. The chunks
    are written to ``<output_folder>/<source stem>_chunks.md``, each one
    preceded by a ``### Chunk <n>`` header (numbered from 1).

    Args:
        file_path: Path of the source document to convert.
        output_folder: Directory that receives the generated ``.md`` file.
    """
    # Conversion and Markdown export in a single chained expression.
    doc_markdown = converter.convert(file_path).document.export_to_markdown()

    # The chunker takes a list of texts; only this one document is passed in.
    chunks = semantic_chunker.create_documents([doc_markdown])

    # Output name: source file name without its extension, plus "_chunks.md".
    stem, _extension = os.path.splitext(os.path.basename(file_path))
    output_file = os.path.join(output_folder, f"{stem}_chunks.md")

    with open(output_file, "w", encoding="utf-8") as handle:
        for number, chunk in enumerate(chunks, start=1):
            handle.write(f"### Chunk {number}\n\n")
            handle.write(chunk.page_content.strip() + "\n\n")
    print(f"✅ File salvato: {output_file} ({len(chunks)} chunk)")

# === Run the conversion for every supported file in the source folder ===
# Files are visited in sorted order so repeated runs process (and log) them in
# the same sequence; os.listdir() alone gives no ordering guarantee.
failed_files = []
for file_name in sorted(os.listdir(SOURCE_FOLDER)):
    file_path = os.path.join(SOURCE_FOLDER, file_name)
    if not (os.path.isfile(file_path) and file_name.lower().endswith(('.pdf', '.docx', '.doc'))):
        continue
    try:
        process_document(file_path, OUTPUT_FOLDER)
    except Exception as e:
        # Batch boundary: a single document that cannot be converted or
        # chunked (e.g. an unsupported or corrupt file) must not abort the
        # remaining ones. The failure is reported and remembered instead.
        failed_files.append(file_name)
        print(f"⚠️ Errore processando '{file_name}': {e}")

# Only claim full success when every document actually went through.
if failed_files:
    print(f"\n⚠️ Elaborazione terminata con {len(failed_files)} errori: {', '.join(failed_files)}")
else:
    print("\n✅ Tutti i documenti processati.")


In [33]:
import os
import chromadb
import requests
import hashlib

# === CONFIG ===
# Folder scanned for the pre-chunked Markdown files to index.
MD_FOLDER = r"/storage/data_4T_b/andreacutuli/PROVA/Documents/Markdown_chunks"  # folder containing the .md files
# A relative location ("./chroma_test_db") was used here before; the store now
# lives under the user's home directory.
CHROMA_PATH = os.path.expanduser("~/chroma_test_db")
# Make sure the storage directory exists before the persistent client opens it.
os.makedirs(CHROMA_PATH, exist_ok=True)
# Model names sent to Ollama: one for embeddings, one for chat completion.
EMBED_MODEL = "nomic-embed-text"
LLM_MODEL = "mistral"
# Base URL of the Ollama HTTP API; endpoint names are appended to it by the functions below.
OLLAMA_URL = "http://localhost:11434/api"

# === ChromaDB setup ===
# Persistent client rooted at CHROMA_PATH.
chroma_client = chromadb.PersistentClient(path=CHROMA_PATH)
collection_name = "document_qa_collection"
# Reuse the collection when it already exists, otherwise create it.
collection = chroma_client.get_or_create_collection(name=collection_name)

# === FUNZIONI ===
def get_ollama_embedding(text, timeout=120):
    """Return the Ollama embedding vector for *text*, or ``None`` on failure.

    Posts ``text`` as the prompt to the ``/embeddings`` endpoint under
    ``OLLAMA_URL`` using the ``EMBED_MODEL`` model.

    Args:
        text: The string to embed.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled server would block the caller indefinitely.

    Returns:
        The value of the ``"embedding"`` field of the JSON response, or
        ``None`` when the request fails, the server answers with an HTTP
        error, the body is not valid JSON, or the embedding is missing or
        empty. In every failure case a warning is printed.
    """
    try:
        response = requests.post(
            f"{OLLAMA_URL}/embeddings",
            json={"model": EMBED_MODEL, "prompt": text},
            timeout=timeout,
        )
        # Report HTTP failures as such instead of letting them surface later
        # as a misleading "missing key" or JSON decoding error.
        response.raise_for_status()
        payload = response.json()
        if not isinstance(payload, dict) or "embedding" not in payload:
            raise KeyError("Chiave 'embedding' mancante nella risposta di Ollama.")
        embedding = payload["embedding"]
        if not embedding:
            # An empty vector is useless for storage or similarity search,
            # so it is treated like any other failure.
            raise ValueError("Embedding vuoto nella risposta di Ollama.")
        return embedding
    except (requests.RequestException, ValueError, KeyError) as e:
        # Best-effort contract: callers check for None and carry on.
        print(f"⚠️ Errore generando embedding: {e}")
        return None

def calculate_file_hash(filepath):
    """Return the hexadecimal SHA-256 digest of the file at *filepath*.

    The file is opened in binary mode and consumed in 4096-byte reads, so its
    whole content is never held in memory at once.
    """
    digest = hashlib.sha256()
    with open(filepath, "rb") as stream:
        while True:
            block = stream.read(4096)
            if not block:
                # An empty read marks the end of the file.
                break
            digest.update(block)
    return digest.hexdigest()

def generate_response(question, relevant_chunks):
    """Ask the chat model to answer *question* from the retrieved chunks.

    The chunks are joined with blank lines into a context section, placed in
    a fixed question-answering prompt together with the question, and sent
    as a non-streaming request to the ``/chat`` endpoint under
    ``OLLAMA_URL`` with the ``LLM_MODEL`` model.

    Args:
        question: The user's question.
        relevant_chunks: Iterable of text chunks used as context.

    Returns:
        The ``message.content`` string of the JSON reply. If the request or
        the parsing of the reply raises, a warning string describing the
        error is returned instead of propagating the exception.
    """
    context = "\n\n".join(relevant_chunks)
    prompt = (
        "You are an assistant for question-answering tasks. Use the following pieces of "
        "retrieved context to answer the question. If you don't know the answer, say that you "
        "don't know. Use three sentences maximum and keep it concise.\n\n"
        f"Context:\n{context}\n\nQuestion:\n{question}"
    )
    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]
    chat_request = {"model": LLM_MODEL, "messages": conversation, "stream": False}
    try:
        reply = requests.post(f"{OLLAMA_URL}/chat", json=chat_request)
        reply.raise_for_status()
        reply_body = reply.json()
        return reply_body['message']['content']
    except Exception as e:
        # Failures are handed back as text so the caller can print them.
        return f"⚠️ Errore durante la generazione della risposta: {e}"

def query_documents(question, n_results=3):
    """Retrieve the stored chunks closest to *question*.

    The question is embedded with ``get_ollama_embedding`` and used to query
    the module-level ``collection``.

    Args:
        question: Text of the query.
        n_results: Number of results requested from the collection.

    Returns:
        A list of ``{"text": ..., "filename": ...}`` dicts, one per hit.
        ``filename`` falls back to ``'Sconosciuto'`` when the hit's metadata
        has no such key. The list is empty when the embedding could not be
        generated or the query returned no documents.
    """
    query_embedding = get_ollama_embedding(question)
    if query_embedding is None:
        return []

    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=n_results,
        include=['metadatas', 'documents'],
    )
    if "documents" not in results or not results["documents"]:
        return []

    # Only one query embedding was sent, so only the first result list is read.
    hits = zip(results["documents"][0], results["metadatas"][0])
    return [
        {"text": doc, "filename": meta.get('filename', 'Sconosciuto')}
        for doc, meta in hits
    ]

def index_md_file(md_filepath):
    """Index one pre-chunked Markdown file into the Chroma collection.

    The file is expected to consist of sections introduced by header lines of
    the form ``### Chunk <n>``. Each section is embedded with
    ``get_ollama_embedding`` and upserted under the id
    ``<filename>_chunk<i>`` with the file name and the file's SHA-256 hash as
    metadata.

    A file is skipped only when every one of its chunks is already stored
    with the current hash. If the hash differs (file changed) or only some
    chunks are present (a previous run had embedding failures), all stored
    chunks for the file are deleted and the file is indexed again.

    Args:
        md_filepath: Path of the Markdown file to index.
    """
    # Local import keeps this block self-contained; the cell's import list
    # does not include ``re``.
    import re

    with open(md_filepath, "r", encoding="utf-8") as f:
        content = f.read()

    file_hash = calculate_file_hash(md_filepath)
    filename = os.path.basename(md_filepath)

    # Split on whole header lines rather than on the bare substring, so the
    # chunk number does not end up at the start of every stored text and a
    # mid-line "### Chunk" inside the body does not cut a chunk in two.
    sections = re.split(r"^### Chunk[ \t]*\d*[ \t]*$", content, flags=re.MULTILINE)
    raw_chunks = [section.strip() for section in sections if section.strip()]

    # Decide whether the stored copy of this file is complete and current.
    existing_docs = collection.get(where={"filename": filename}, include=["metadatas"])
    stored_metadatas = (existing_docs or {}).get("metadatas") or []
    if stored_metadatas:
        up_to_date = sum(
            1 for meta in stored_metadatas if meta and meta.get("file_hash") == file_hash
        )
        if up_to_date == len(stored_metadatas) == len(raw_chunks):
            print(f"✔️ '{filename}' non è cambiato. Saltato.")
            return
        # Drop whatever is stored for this file before re-indexing it.
        collection.delete(where={"filename": filename})
        if up_to_date:
            # Hash matches but the chunk count does not: earlier run was partial.
            print(f"🔄 '{filename}' indicizzato solo in parte. Re-indicizzazione in corso...")
        else:
            print(f"🔄 '{filename}' modificato. Re-indicizzazione in corso...")

    failed_chunks = 0
    for i, chunk_text in enumerate(raw_chunks, start=1):
        chunk_id = f"{filename}_chunk{i}"
        embedding = get_ollama_embedding(chunk_text)
        if embedding is None:
            print(f"⚠️ Embedding fallito per {chunk_id}")
            failed_chunks += 1
            continue
        collection.upsert(
            ids=[chunk_id],
            documents=[chunk_text],
            embeddings=[embedding],
            # One metadata dict per id, matching the other list arguments.
            metadatas=[{"filename": filename, "file_hash": file_hash}],
        )

    if failed_chunks:
        # The chunk count stored for this file is now lower than the number of
        # chunks in the file, so the next run will index it again.
        print(
            f"⚠️ Indicizzazione incompleta per {filename}: "
            f"{failed_chunks} chunk non indicizzati, verranno ritentati alla prossima esecuzione."
        )
    else:
        print(f"✅ Indicizzazione completata per {filename}")

# === MAIN RUN: index every Markdown file found in the folder ===
for file_name in os.listdir(MD_FOLDER):
    file_path = os.path.join(MD_FOLDER, file_name)
    # Skip anything that is not a regular file with a .md extension.
    if not os.path.isfile(file_path) or not file_name.lower().endswith(".md"):
        continue
    index_md_file(file_path)

# === QUERY AND ANSWER ===
question = "parlami del riscaldamento globale"
retrieved_results = query_documents(question)

if retrieved_results:
    # Only the chunk texts are passed to the model; file names are listed below.
    relevant_chunks = [res["text"] for res in retrieved_results]
    answer = generate_response(question, relevant_chunks)

    print("\n✅ Risposta sintetica:")
    print(answer)

    # Distinct source file names, in alphabetical order.
    source_files = sorted({res["filename"] for res in retrieved_results})
    print("\n📚 Fonti utilizzate:")
    for filename in source_files:
        print(f"- {filename}")
else:
    print("⚠️ Nessun documento rilevante trovato.")


✔️ '1 - Copia (4)_chunks.md' non è cambiato. Saltato.
✔️ '1 - Copia_chunks.md' non è cambiato. Saltato.
✔️ '1 - Copia (5)_chunks.md' non è cambiato. Saltato.
✔️ '1 - Copia (3)_chunks.md' non è cambiato. Saltato.
✔️ '1 - Copia (6)_chunks.md' non è cambiato. Saltato.
✔️ '1_chunks.md' non è cambiato. Saltato.
✔️ '1 - Copia (2)_chunks.md' non è cambiato. Saltato.
✔️ '1 - Copia (8)_chunks.md' non è cambiato. Saltato.
✔️ '1 - Copia (9)_chunks.md' non è cambiato. Saltato.
✔️ '1 - Copia (7)_chunks.md' non è cambiato. Saltato.

✅ Risposta sintetica:
 The rising global temperatures, primarily due to human-induced greenhouse gas emissions, have already produced significant impacts on ecosystems and societies. Rapid retreat of glaciers, rising sea levels, and increasingly frequent extreme weather events such as hurricanes, droughts, and floods are among the alterations with direct implications on agriculture, water supply, and biodiversity. Apart from environmental damage, climate change also brin