# RETRIEVAL OLLAMA

## Elimina precedente database per inizializzazione pulita

In [9]:
import shutil
import os


CHROMA_PATH = "chroma_persistent_storage"
shutil.rmtree(CHROMA_PATH, ignore_errors=True)


cartella = CHROMA_PATH

# Verifica che la cartella esista
if os.path.exists(cartella):
    shutil.rmtree(CHROMA_PATH)
    print(f"Cartella '{cartella}' eliminata con successo.")
else:
    print(f"La cartella '{cartella}' non esiste.")


La cartella 'chroma_persistent_storage' non esiste.


In [19]:
import os
import chromadb
import requests
import json
import hashlib

# === CONFIG ===
DOCUMENTS_DIR = r"/storage/data_4T_b/andreacutuli/PROVA/Documents/doc-fac-simile"
CHROMA_PATH = "./chroma_test_db1"
EMBED_MODEL = "nomic-embed-text"
LLM_MODEL = "mistral"
OLLAMA_URL = "http://localhost:11434/api"

# === Setup Chroma ===
chroma_client = chromadb.PersistentClient(path=CHROMA_PATH)
collection_name = "document_qa_collection"
collection = chroma_client.get_or_create_collection(name=collection_name)

# === FUNZIONI === (Le funzioni fino a qui restano invariate)

def calculate_file_hash(filepath):
    """Calcola l'hash SHA-256 di un file."""
    sha256_hash = hashlib.sha256()
    with open(filepath, "rb") as f:
        for byte_block in iter(lambda: f.read(4096), b""):
            sha256_hash.update(byte_block)
    return sha256_hash.hexdigest()

def split_text(text, chunk_size=1000, chunk_overlap=200):
    """Divide il testo in chunk con sovrapposizione."""
    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunks.append(text[start:end])
        start = end - chunk_overlap
    return chunks

def get_ollama_embedding(text):
    """Genera embedding via Ollama REST API."""
    try:
        response = requests.post(
            f"{OLLAMA_URL}/embeddings",
            json={"model": EMBED_MODEL, "prompt": text}
        ).json()
        if "embedding" not in response:
            raise KeyError("La chiave 'embedding' non è presente nella risposta di Ollama.")
        return response["embedding"]
    except requests.exceptions.RequestException as e:
        print(f"Errore di connessione a Ollama: {e}")
        return None
    except KeyError as e:
        print(f"Errore nella risposta JSON di Ollama: {e}")
        return None

def generate_response(question, relevant_chunks):
    """Genera una risposta usando il modello LLM e i chunk rilevanti."""
    context = "\n\n".join(relevant_chunks)
    prompt = (
        "You are an assistant for question-answering tasks. Use the following pieces of "
        "retrieved context to answer the question. If you don't know the answer, say that you "
        "don't know. Use three sentences maximum and keep it concise.\n\n"
        f"Context:\n{context}\n\nQuestion:\n{question}"
    )
    
    try:
        response = requests.post(
            f"{OLLAMA_URL}/chat",
            json={
                "model": LLM_MODEL,
                "messages": [
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content": prompt}
                ],
                "stream": False
            },
        )
        response.raise_for_status()
        response_data = response.json()
        return response_data['message']['content']
    except requests.exceptions.RequestException as e:
        return f"Errore durante la generazione della risposta: {e}"
    except KeyError:
        return "Errore: la risposta dal modello LLM non ha il formato atteso."

# --- MODIFICA 1: Aggiorniamo query_documents per restituire anche i metadati ---
def query_documents(question, n_results=3):
    """
    Interroga ChromaDB, recuperando sia i chunk di testo che i loro metadati
    per poter identificare il file di origine.
    """
    query_embedding = get_ollama_embedding(question)
    if query_embedding is None:
        return []
    
    results = collection.query(
        query_embeddings=[query_embedding], 
        n_results=n_results,
        include=['metadatas', 'documents']  # Chiediamo esplicitamente metadati e documenti
    )
    
    retrieved_data = []
    if "documents" in results and results["documents"]:
        # Combiniamo i documenti e i metadati in una lista di dizionari
        for doc, meta in zip(results["documents"][0], results["metadatas"][0]):
            retrieved_data.append({
                "text": doc,
                "filename": meta.get('filename', 'Sconosciuto') # .get è più sicuro
            })
            
    return retrieved_data

# === INDICIZZAZIONE INCREMENTALE DEI DOCUMENTI === (Invariata)
def index_documents():
    print("Avvio del processo di indicizzazione...")
    existing_docs_metadata = collection.get(include=["metadatas"])
    db_file_hashes = {}
    if existing_docs_metadata and existing_docs_metadata['metadatas']:
        for meta in existing_docs_metadata['metadatas']:
            if 'filename' in meta and 'file_hash' in meta:
                db_file_hashes[meta['filename']] = meta['file_hash']

    disk_files = set(f for f in os.listdir(DOCUMENTS_DIR) if f.endswith(".txt"))
    db_files = set(db_file_hashes.keys())

    for filename in disk_files:
        filepath = os.path.join(DOCUMENTS_DIR, filename)
        current_hash = calculate_file_hash(filepath)
        if filename in db_file_hashes and db_file_hashes[filename] == current_hash:
            print(f"✔️  '{filename}' non è cambiato. Saltato.")
            continue
        if filename in db_file_hashes:
            print(f"🔄 '{filename}' è stato modificato. Re-indicizzazione in corso...")
            collection.delete(where={"filename": filename})
        else:
            print(f"➕ '{filename}' è un nuovo file. Indicizzazione in corso...")
        with open(filepath, "r", encoding="utf-8") as f:
            text = f.read()
        chunks = split_text(text)
        for i, chunk_text in enumerate(chunks):
            chunk_id = f"{filename}_chunk{i+1}"
            embedding = get_ollama_embedding(chunk_text)
            if embedding:
                collection.upsert(
                    ids=[chunk_id],
                    documents=[chunk_text],
                    embeddings=[embedding],
                    metadatas=[{"filename": filename, "file_hash": current_hash}]
                )
    deleted_files = db_files - disk_files
    if deleted_files:
        for filename in deleted_files:
            print(f"➖ '{filename}' è stato rimosso. Eliminazione dal database...")
            collection.delete(where={"filename": filename})
    print("\nIndicizzazione completata.")






# === ESECUZIONE PRINCIPALE ===

index_documents()


while True:
    question = input("\nInserisci la tua domanda (o 'esci' per terminare): ")
    if question.lower() == 'esci':
        break

    # --- MODIFICA 2: Gestiamo la nuova struttura dei dati recuperati ---
    
    # 1. Recupera la lista di dizionari (testo + nome file)
    retrieved_results = query_documents(question)
    
    if not retrieved_results:
        print("Non sono riuscito a trovare documenti rilevanti per la tua domanda.")
        continue

    # 2. Estrai solo il testo da passare al modello LLM
    relevant_chunks = [res["text"] for res in retrieved_results]


        # Recupera la risposta del modello
    answer = generate_response(question, relevant_chunks)

    # Messaggio fisso per l’elenco dei documenti
    source_msg = "i documenti sono:"

    print("\n✅ Risposta sintetica:")
    print(answer)  # qui mostri la risposta reale

    print(f"\n📚 {source_msg}")
for filename in source_files:
    print(f"- {filename}")

    source_files = sorted(list(set(res["filename"] for res in retrieved_results)))

    print("\n📚 Fonti utilizzate:")
    for filename in source_files:
        print(f"- {filename}")

Avvio del processo di indicizzazione...
✔️  '1 - Copia.txt' non è cambiato. Saltato.
✔️  '1.txt' non è cambiato. Saltato.
✔️  '1 - Copia (8).txt' non è cambiato. Saltato.
✔️  '1 - Copia (9).txt' non è cambiato. Saltato.
✔️  '1 - Copia (2).txt' non è cambiato. Saltato.
✔️  '1 - Copia (4).txt' non è cambiato. Saltato.
✔️  '1 - Copia (3).txt' non è cambiato. Saltato.
✔️  '1 - Copia (7).txt' non è cambiato. Saltato.
✔️  '1 - Copia (6).txt' non è cambiato. Saltato.
✔️  '1 - Copia (5).txt' non è cambiato. Saltato.

Indicizzazione completata.



✅ Risposta sintetica:
 The artificial intelligence (IA) is a significant technological innovation of the 21st century, capable of performing tasks previously considered exclusively human. It finds practical applications in various sectors such as healthcare, finance, transportation, and industry by automating complex processes. However, along with its efficiency, AI raises ethical and social questions like job replacement, algorithmic discrimination, and privacy protection. Research focuses not only on technological innovation but also on developing tools for responsible use and the protection of privacy. In the future, the integration of AI and human intelligence could redefine creativity, critical thinking, and human-machine collaboration entirely.

📚 i documenti sono:


IndexError: list index out of range in query.

In [None]:
all_files = [f for f in os.listdir(DOCUMENTS_DIR) if f.endswith(".txt")]
total_files = len(all_files)
correct_retrievals = 0

for file in all_files:
    query_title = os.path.splitext(file)[0]

    retrieved_results = query_documents(query_title)
    if not retrieved_results:
        continue

    top_files = [res["filename"] for res in retrieved_results[:2]]

    # Controlla se almeno uno dei primi due file corrisponde al titolo della query
    match_found = any(query_title in os.path.splitext(f)[0] for f in top_files)
    if match_found:
        correct_retrievals += 1

# Calcola la percentuale di retrieval corretto
if total_files > 0:
    accuracy = (correct_retrievals / total_files) * 100
    print(f"\n✅ Percentuale di retrieval corretto: {accuracy:.2f}% ({correct_retrievals}/{total_files})")
else:
    print("Nessun file trovato per il test di retrieval.")


# RETRIEVAL CON EMBEDDIZZAZIONE IN PARALLELO

In [None]:
import os
import requests
import json
import hashlib
import concurrent.futures
import chromadb

# === CONFIGURAZIONE ===
DOCUMENTS_DIR = r"C:\Users\user\Desktop\claims"
CHROMA_PATH = "./chroma_persistent_storage"
EMBED_MODEL = "nomic-embed-text"
LLM_MODEL = "mistral"
OLLAMA_URL = "http://localhost:11434/api"

# === FUNZIONI DI UTILITÀ ===
def calculate_file_hash(filepath):
    sha256_hash = hashlib.sha256()
    with open(filepath, "rb") as f:
        for byte_block in iter(lambda: f.read(4096), b""):
            sha256_hash.update(byte_block)
    return sha256_hash.hexdigest()

def split_text(text, chunk_size=1000, chunk_overlap=200):
    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunks.append(text[start:end])
        start = end - chunk_overlap
    return chunks

def get_ollama_embedding(text):
    try:
        response = requests.post(
            f"{OLLAMA_URL}/embeddings",
            json={"model": EMBED_MODEL, "prompt": text}
        ).json()
        return response.get("embedding")
    except Exception as e:
        print(f"Errore durante la generazione embedding: {e}")
        return None

def generate_response(question, relevant_chunks):
    """Genera risposta usando LLM e chunk rilevanti."""
    context = "\n\n".join(relevant_chunks)
    prompt = (
        "You are an assistant for question-answering tasks. Use the following pieces of "
        "retrieved context to answer the question. If you don't know the answer, say that you "
        "don't know. Use three sentences maximum and keep it concise.\n\n"
        f"Context:\n{context}\n\nQuestion:\n{question}"
    )

    try:
        response = requests.post(
            f"{OLLAMA_URL}/chat",
            json={
                "model": LLM_MODEL,
                "messages": [
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content": prompt}
                ],
                "stream": False
            },
        )
        response.raise_for_status()
        response_data = response.json()
        return response_data['message']['content']
    except requests.exceptions.RequestException as e:
        return f"Errore durante la generazione della risposta: {e}"
    except (KeyError, json.JSONDecodeError):
        return "Errore: la risposta dal modello LLM non ha il formato atteso."

def query_documents(question, collection, n_results=3):
    """Interroga ChromaDB per trovare chunk rilevanti."""
    query_embedding = get_ollama_embedding(question)
    if query_embedding is None:
        return []

    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=n_results,
        include=['metadatas', 'documents']
    )

    retrieved_data = []
    if "documents" in results and results["documents"]:
        for doc, meta in zip(results["documents"][0], results["metadatas"][0]):
            retrieved_data.append({
                "text": doc,
                "filename": meta.get('filename', 'Sconosciuto')
            })
    return retrieved_data

def process_file(filepath, filename):
    """Elaborazione indipendente di un file. Non accede al client Chroma."""
    try:
        current_hash = calculate_file_hash(filepath)
        with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
            text = f.read()

        chunks = split_text(text)
        ids, documents, embeddings, metadatas = [], [], [], []

        for i, chunk_text in enumerate(chunks):
            embedding = get_ollama_embedding(chunk_text)
            if embedding:
                ids.append(f"{filename}_chunk{i+1}")
                documents.append(chunk_text)
                embeddings.append(embedding)
                metadatas.append({"filename": filename, "file_hash": current_hash})

        return (ids, documents, embeddings, metadatas) if ids else None
    except Exception as e:
        print(f"❌ Errore irreversibile nel file '{filename}': {e}")
        return None

# === FUNZIONE PRINCIPALE DI INDICIZZAZIONE ===
def index_documents():
    chroma_client = chromadb.PersistentClient(path=CHROMA_PATH)
    collection = chroma_client.get_or_create_collection(name="document_qa_collection")

    existing_docs_metadata = collection.get(include=["metadatas"])
    db_file_hashes = {
        meta['filename']: meta['file_hash']
        for meta in existing_docs_metadata.get('metadatas', [])
        if 'filename' in meta and 'file_hash' in meta
    }

    disk_files = {f for f in os.listdir(DOCUMENTS_DIR) if f.endswith(".txt")}
    db_files = set(db_file_hashes.keys())

    files_to_process = []
    for filename in disk_files:
        filepath = os.path.join(DOCUMENTS_DIR, filename)
        current_hash = calculate_file_hash(filepath)
        if db_file_hashes.get(filename) == current_hash:
            print(f"✔️  '{filename}' non è cambiato. Saltato.")
            continue
        if filename in db_file_hashes:
            print(f"🔄 '{filename}' modificato. Re-indicizzazione.")
            collection.delete(where={"filename": filename})
        else:
            print(f"➕ '{filename}' è nuovo. Indicizzazione.")
        files_to_process.append((filepath, filename))

    all_ids, all_documents, all_embeddings, all_metadatas = [], [], [], []

    if files_to_process:
        print(f"\n🚀 Elaborazione parallela di {len(files_to_process)} file con ThreadPoolExecutor...")
        with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
            future_to_file = {executor.submit(process_file, fp, fn): fn for fp, fn in files_to_process}
            for future in concurrent.futures.as_completed(future_to_file):
                filename = future_to_file[future]
                try:
                    result = future.result()
                    if result:
                        ids, documents, embeddings, metadatas = result
                        all_ids.extend(ids)
                        all_documents.extend(documents)
                        all_embeddings.extend(embeddings)
                        all_metadatas.extend(metadatas)
                except Exception as exc:
                    print(f"❌ Eccezione grave durante l'elaborazione di '{filename}': {exc}")

        if all_ids:
            print(f"\n✅ Inserimento in batch di {len(all_ids)} chunk nel database...")
            collection.upsert(
                ids=all_ids,
                documents=all_documents,
                embeddings=all_embeddings,
                metadatas=all_metadatas
            )
        else:
            print("\n⚠️ Nessun nuovo chunk da inserire nel database.")

    deleted_files = db_files - disk_files
    for filename in deleted_files:
        print(f"➖ '{filename}' rimosso dal disco. Eliminazione dal database...")
        collection.delete(where={"filename": filename})

    print("\nIndicizzazione completata.")

# === ESECUZIONE PRINCIPALE ===
if __name__ == "__main__":
    index_documents()

    chroma_client = chromadb.PersistentClient(path=CHROMA_PATH)
    collection = chroma_client.get_or_create_collection(name="document_qa_collection")

    while True:
        question = input("\nInserisci la tua domanda (o 'esci' per terminare): ")
        if question.lower() == 'esci':
            break

        retrieved_results = query_documents(question, collection)
        if not retrieved_results:
            print("Non sono riuscito a trovare documenti rilevanti.")
            continue

        relevant_chunks = [res["text"] for res in retrieved_results]
        answer = generate_response(question, relevant_chunks)

        source_files = sorted(list(set(res["filename"] for res in retrieved_results)))

        print("\n✅ Risposta sintetica:")
        print(answer)

        print("\n📚 Fonti utilizzate:")
        for filename in source_files:
            print(f"- {filename}")


In [None]:
def evaluate_retrieval(collection, top_k=2):
    all_files = [f for f in os.listdir(DOCUMENTS_DIR) if f.endswith(".txt")]
    total_files = len(all_files)
    correct_retrievals = 0

    for file in all_files:
        query_title = os.path.splitext(file)[0]

        retrieved_results = query_documents(query_title, collection, n_results=top_k)
        if not retrieved_results:
            continue

        top_files = [res["filename"] for res in retrieved_results[:top_k]]

        match_found = any(query_title in os.path.splitext(f)[0] for f in top_files)
        if match_found:
            correct_retrievals += 1

    if total_files > 0:
        accuracy = (correct_retrievals / total_files) * 100
        print(f" accuracy = {accuracy:.2f}%")
    else:
        print("0.00")


import sys, io

if __name__ == "__main__":
    # silenzia i print di index_documents
    sys_stdout_backup = sys.stdout
    sys.stdout = io.StringIO()
    index_documents()
    sys.stdout = sys_stdout_backup

    chroma_client = chromadb.PersistentClient(path=CHROMA_PATH)
    collection = chroma_client.get_or_create_collection(name="document_qa_collection")

    evaluate_retrieval(collection, top_k=2)


# DIVISIONE DOCUMENTO IN CHUNK E SINTESI DI OGNI CHUNK

In [None]:
import os
import requests
from langchain_experimental.text_splitter import SemanticChunker
from langchain_huggingface import HuggingFaceEmbeddings
import chromadb

#CONFIGURAZIONE
DOCUMENTS_DIR = r"C:\Users\user\Desktop\claims"
OLLAMA_URL = "http://localhost:11434/api"
LLM_MODEL = "mistral"              
EMBED_MODEL = "nomic-embed-text"   
HF_MODEL = "sentence-transformers/all-MiniLM-L6-v2"  

CHROMA_PATH = "./chroma_persistent_storage1"

# MODELLI EMBEDDING E CHUNKER SEMANTICO
embedding_model_hf = HuggingFaceEmbeddings(model_name=HF_MODEL)
semantic_chunker = SemanticChunker(
    embeddings=embedding_model_hf,
    breakpoint_threshold_type="percentile"
)

chroma_client = chromadb.PersistentClient(path=CHROMA_PATH)
collection = chroma_client.get_or_create_collection(name="chunk_summaries")

# FUNZIONI 
def summarize_with_ollama(text):
    prompt = f"Riepiloga in modo conciso il seguente testo:\n\n{text}"
    response = requests.post(
        f"{OLLAMA_URL}/chat",
        json={
            "model": LLM_MODEL,
            "messages": [{"role": "user", "content": prompt}],
            "stream": False
        }
    )
    response.raise_for_status()
    return response.json()["message"]["content"].strip()

def embed_with_ollama(text):
    response = requests.post(
        f"{OLLAMA_URL}/embeddings",
        json={"model": EMBED_MODEL, "prompt": text}
    )
    response.raise_for_status()
    return response.json()["embedding"]

#FUNZIONE PROCESSA DOCUMENTI
def process_documents_one_by_one():
    for filename in os.listdir(DOCUMENTS_DIR):
        if not filename.endswith(".txt"):
            continue

        filepath = os.path.join(DOCUMENTS_DIR, filename)
        with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
            text = f.read()

        chunks = semantic_chunker.split_text(text)
        print(f"{filename}: {len(chunks)} chunk trovati")

        ids = []
        documents = []
        metadatas = []
        embeddings = []

        for i, chunk in enumerate(chunks, start=1):
            summary = summarize_with_ollama(chunk)
            embedding = embed_with_ollama(summary)

            ids.append(f"{filename}_chunk{i}")
            documents.append(summary)
            metadatas.append({
                "filename": filename,
                "filepath": filepath  # aggiunto il percorso completo
            })
            embeddings.append(embedding)

        # Salva chunk del singolo documento in ChromaDB
        collection.upsert(
            ids=ids,
            documents=documents,
            metadatas=metadatas,
            embeddings=embeddings
        )
        print(f"{filename} salvato in ChromaDB con percorso.")



if __name__ == "__main__":
    process_documents_one_by_one()
    print("Tutti i documenti processati e salvati.")



Are_any_of_the_border_states_covered_by_the_Ninth_Circuit_Court_of_Appeals.txt: 20 chunk trovati
Are_any_of_the_border_states_covered_by_the_Ninth_Circuit_Court_of_Appeals.txt salvato in ChromaDB con percorso.
Are_green_house_gasses_what_cause_holes_in_the_ozone_layer.txt: 53 chunk trovati
Are_green_house_gasses_what_cause_holes_in_the_ozone_layer.txt salvato in ChromaDB con percorso.
Are_there_any_circumstances_where_exemptions_of_mandatory_vaccinations_in_those_provinces_for_studen.txt: 97 chunk trovati
Are_there_any_circumstances_where_exemptions_of_mandatory_vaccinations_in_those_provinces_for_studen.txt salvato in ChromaDB con percorso.
Did_Barack_Obama_write_any_Autobiographies_before_2019.txt: 26 chunk trovati
Did_Barack_Obama_write_any_Autobiographies_before_2019.txt salvato in ChromaDB con percorso.
Did_Hunter_Biden_have_any_experience_in_the_energy_sector_at_the_time_he_joined_the_board_of_the__Bu.txt: 89 chunk trovati
Did_Hunter_Biden_have_any_experience_in_the_energy_sector

KeyboardInterrupt: 

In [12]:
result = collection.get(limit=1, include=["documents", "metadatas", "embeddings"])

print("ID:", result["ids"][0])
print("Sintesi:", result["documents"][0])
print("Metadati:", result["metadatas"][0])
print("Embedding (prime 10 dimensioni):", result["embeddings"][0][:10])


ID: Are_green_house_gasses_what_cause_holes_in_the_ozone_layer.txt_chunk1
Sintesi: The text you provided is a summary of information about greenhouse gases, their sources, effects, and potential solutions to reduce emissions. Here are some key points:

1. Greenhouse gases (GHGs) are gases in Earth's atmosphere that trap heat and warm the planet, leading to the greenhouse effect. Examples include carbon dioxide (CO2), methane, nitrous oxide, and fluorinated gases.

2. Human activities, such as burning fossil fuels and deforestation, have significantly increased GHG emissions in the industrial era. These activities contribute to global warming and climate change.

3. The greenhouse effect is responsible for making Earth's overall temperature higher than it would be without these gases. The term "greenhouse" was first applied to this phenomenon by Nils Gustaf Ekholm in 1901.

4. Most GHGs have both natural and human-caused sources, with the exception of synthetic halocarbons which have no

# Retrieval sul testo sintetizzato

In [None]:
import requests
import chromadb

#CONFIGURAZIONE 
CHROMA_PATH = "./chroma_persistent_storage1"
OLLAMA_URL = "http://localhost:11434/api"
LLM_MODEL = "mistral"
TOP_K = 3  # numero di chunk da recuperare

#Inizializzazone ChromaDB
chroma_client = chromadb.PersistentClient(path=CHROMA_PATH)
collection = chroma_client.get_or_create_collection(name="chunk_summaries")

#ChromaDB query
def query_chroma(question, top_k=TOP_K):
    # recupera embedding della domanda
    response = requests.post(
        f"{OLLAMA_URL}/embeddings",
        json={"model": "nomic-embed-text", "prompt": question}
    )
    response.raise_for_status()
    query_embedding = response.json()["embedding"]

    # cerca chunk più rilevanti
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=top_k,
        include=["documents", "metadatas"]
    )

    retrieved_chunks = []
    if results["documents"]:
        for doc, meta in zip(results["documents"][0], results["metadatas"][0]):
            retrieved_chunks.append({"text": doc, "filename": meta.get("filename", "")})
    return retrieved_chunks


'''

# === Funzione per generare risposta con Mistral ===
def generate_answer(question, retrieved_chunks):
    context = "\n\n".join([c["text"] for c in retrieved_chunks])
    prompt = (
        f"Usa i seguenti contesti per rispondere brevemente alla domanda in italiano.\n\n"
        f"Contesto:\n{context}\n\nDomanda:\n{question}"
    )

    response = requests.post(
        f"{OLLAMA_URL}/chat",
        json={
            "model": LLM_MODEL,
            "messages": [
                {"role": "system", "content": "Sei un assistente utile."},
                {"role": "user", "content": prompt}
            ],
            "stream": False
        }
    )
    response.raise_for_status()
    return response.json()["message"]["content"]

    '''

# GENERARE RETRIEVAL CON MISTRAL
if __name__ == "__main__":
    while True:
        question = input("\nInserisci la tua domanda (o 'esci' per terminare): ")
        if question.lower() == "esci":
            break

        chunks = query_chroma(question)
        if not chunks:
            print("Nessun chunk rilevante trovato.")
            continue

        #answer = generate_answer(question, chunks)
        #print("\n✅ Risposta:")
        #print(answer)

        print("\n📚 Fonti utilizzate:")
        for c in chunks:
            print(f"- {c['filename']}")


'''
idee da implementare: hash per evitare rielaborazione documenti non cambiati
'''





Inserisci la tua domanda (o 'esci' per terminare):  hunter biden



✅ Risposta:
 Hunter Biden is an American businessman, lawyer, and the son of U.S. President Joe Biden. Born on February 4, 1970, he has worked in various fields such as investing, lobbying, and philanthropy. He has been involved in several controversies, including investigations and federal indictments for firearms trials, tax indictment, guilty pleas, pardon of criminal offenses, laptop files, Navy Reserve issues, and litigation. In his personal life, he has had relationships and struggled with drug and alcohol abuse. He is known in popular culture for his role in the Ukraine conspiracy theory, Department of Justice investigation, laptop controversy, and the book "Beautiful Things" about his struggles with addiction.

📚 Fonti utilizzate:
- Did_Hunter_Biden_have_any_experience_in_Ukraine_at_the_time_he_joined_the_board_of_the__Burisma_ener.txt
- Did_Hunter_Biden_have_any_experience_in_Ukraine_at_the_time_he_joined_the_board_of_the__Burisma_ener.txt
- Did_Hunter_Biden_have_any_experien


Inserisci la tua domanda (o 'esci' per terminare):  Did_Barack_Obama_write_any_Autobiographies_before_2019?



✅ Risposta:
 No, Barack Obama did not write any autobiographies before 2019, according to the provided bibliography. The only autobiography mentioned in the context is by Sasha Abramsky, published in 2009, which is not written by Obama himself.

📚 Fonti utilizzate:
- Did_Barack_Obama_write_any_Autobiographies_before_2019.txt
- Did_Barack_Obama_write_any_Autobiographies_before_2019.txt
- Did_Barack_Obama_write_any_Autobiographies_before_2019.txt



Inserisci la tua domanda (o 'esci' per terminare):  esci


In [17]:
import os
import requests
import chromadb

# === CONFIG ===
DOCUMENTS_DIR = r"C:\Users\user\Desktop\claims"
CHROMA_PATH = "./chroma_persistent_storage1"
OLLAMA_URL = "http://localhost:11434/api"
EMBED_MODEL = "nomic-embed-text"
TOP_K = 3

# === Inizializza ChromaDB ===
chroma_client = chromadb.PersistentClient(path=CHROMA_PATH)
collection = chroma_client.get_or_create_collection(name="chunk_summaries")

# === Funzione per interrogare ChromaDB ===
def query_chroma(question, top_k=TOP_K):
    response = requests.post(
        f"{OLLAMA_URL}/embeddings",
        json={"model": EMBED_MODEL, "prompt": question}
    )
    response.raise_for_status()
    query_embedding = response.json()["embedding"]

    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=top_k,
        include=["documents", "metadatas"]
    )

    retrieved_chunks = []
    if results["documents"]:
        for doc, meta in zip(results["documents"][0], results["metadatas"][0]):
            retrieved_chunks.append({"text": doc, "filename": meta.get("filename", "")})
    return retrieved_chunks

# === Funzione di valutazione retrieval ===
def evaluate_retrieval(collection, top_k=TOP_K):
    all_files = [f for f in os.listdir(DOCUMENTS_DIR) if f.endswith(".txt")]
    total_files = len(all_files)
    correct_retrievals = 0

    for file in all_files:
        query_title = os.path.splitext(file)[0]  # nome file senza estensione

        retrieved_results = query_chroma(query_title, top_k=top_k)
        if not retrieved_results:
            continue

        # prendi i top-k nomi file dei chunk recuperati
        top_files = [res["filename"] for res in retrieved_results[:top_k]]

        # match se almeno uno contiene il titolo
        match_found = any(query_title in os.path.splitext(f)[0] for f in top_files)
        if match_found:
            correct_retrievals += 1

    if total_files > 0:
        accuracy = (correct_retrievals / total_files) * 100
        print(f"\n📊 Retrieval accuracy = {accuracy:.2f}% "
              f"({correct_retrievals}/{total_files})")
    else:
        print("Nessun file trovato per valutazione.")

# === Main ===
if __name__ == "__main__":
    evaluate_retrieval(collection, top_k=TOP_K)



📊 Retrieval accuracy = 21.00% (21/100)


In [20]:
import os
import requests
import chromadb

# === CONFIG ===
CHROMA_PATH = "./chroma_persistent_storage1"
OLLAMA_URL = "http://localhost:11434/api"
TOP_K = 2

# === Inizializza ChromaDB ===
chroma_client = chromadb.PersistentClient(path=CHROMA_PATH)
collection = chroma_client.get_or_create_collection(name="chunk_summaries")

# === Funzione query ===
def query_chroma(question, top_k=TOP_K):
    response = requests.post(
        f"{OLLAMA_URL}/embeddings",
        json={"model": "nomic-embed-text", "prompt": question}
    )
    response.raise_for_status()
    query_embedding = response.json()["embedding"]

    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=top_k,
        include=["documents", "metadatas"]
    )

    retrieved_chunks = []
    if results["documents"]:
        for doc, meta in zip(results["documents"][0], results["metadatas"][0]):
            retrieved_chunks.append({"text": doc, "filename": meta.get("filename", "")})
    return retrieved_chunks

# === Funzione di valutazione ===
def evaluate_retrieval(questions, collection, top_k=2):
    total = len(questions)
    correct = 0

    for q in questions:
        retrieved = query_chroma(q, top_k=top_k)
        if not retrieved:
            continue

        top_files = [res["filename"] for res in retrieved[:top_k]]
        # match se il nome della domanda è contenuto nel filename
        match_found = any(q in os.path.splitext(f)[0] for f in top_files)
        if match_found:
            correct += 1

    accuracy = (correct / total) * 100 if total > 0 else 0
    print(f"📊 Retrieval accuracy = {accuracy:.2f}% ({correct}/{total})")

# === Lista delle domande ===
questions = [
    "Are_any_of_the_border_states_covered_by_the_Ninth_Circuit_Court_of_Appeals",
    "Are_green_house_gasses_what_cause_holes_in_the_ozone_layer",
    "Are_there_any_circumstances_where_exemptions_of_mandatory_vaccinations_in_those_provinces_for_studen",
    "Did_Barack_Obama_write_any_Autobiographies_before_2019",
    "Did_Hunter_Biden_have_any_experience_in_the_energy_sector_at_the_time_he_joined_the_board_of_the__Bu",
    "Did_Hunter_Biden_have_any_experience_in_Ukraine_at_the_time_he_joined_the_board_of_the__Burisma_ener",
    "Did_Kenya_build_11200_kilometres_of_tarmacked_roads_in_the_50_years_post_independence",
    "Did_Nancy_Green_have_any_other_jobs_after_her_role_as_Aunt_Jemima",
    "Did_Sen_Bernie_Sanders_have_a_job_before_age_53",
    "Did_the_Democrats_and_the_Deep_state_do_anything_to_create_the_narcotics_epidemic_in_the_USA",
    "Does_Adam_Schiff_have_siblings",
    "Does_Cherie_Blair_deal_with_immigration_law",
    "Does_USA_have_a_subsidy_system_for_wind_turbines",
    "Do_foreign_governments_gets_to_pick_lottery_applicants",
    "Has_Amy_Klobuchar_won_every_election_she_has_been_in",
    "Has_Elizabeth_Warren_won_every_election_she_has_been_in",
    "Has_the_motto_of_the_Supreme_Court_of_India_been_changed",
    "How_is_the_nightly_pledge_of_allegiance_recited",
    "How_long_did_Nancy_Green_portray_Aunt_Jemima_",
    "How_long_did_the_1968_Flu_pandemic_last",
    "How_many_mosques_are_in_Bangaluru_Bangalore",
    "How_old_was_the_bridge_when_it_collapsed"
]

if __name__ == "__main__":
    evaluate_retrieval(questions, collection, top_k=2)


📊 Retrieval accuracy = 90.91% (20/22)
