In [1]:
# Cell 0: RAG Initialization (Run First)
# -------------------------------------
import os, torch, sys
import minirag
from transformers import AutoTokenizer, AutoModel
from minirag.llm.hf import hf_embed
from minirag.utils import EmbeddingFunc
from minirag.llm import ollama
from minirag.llm.openai import openai_complete, openai_queue_completion
from minirag import MiniRAG
from tqdm.auto import tqdm
import dotenv
# Hugging Face model id used for both the tokenizer and the embedding model below.
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
LLM_MODEL_NAME = "qwen2.5:latest"  # must match the model available to Ollama on host
# NOTE(review): absolute, user-specific Windows path; the notebook will not run
# unchanged on another machine. Consider a path relative to the project root.
WORKING_DIR = r"C:\Users\Francisco Azeredo\Downloads\gerador_documentos_gpt_azure (1)\gerador_documentos_gpt_azure\storage"
# Passed to MiniRAG as `log_level` when the `rag` object is built later in this cell.
LOG_LEVEL = "CRITICAL"

# Create the storage directory up front; a no-op when it already exists.
os.makedirs(WORKING_DIR, exist_ok=True)

# Use the GPU when torch reports CUDA as available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Init device:", device)

print("Loading embedding tokenizer/model...")
_tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL)
_embed_model = AutoModel.from_pretrained(EMBEDDING_MODEL).to(device)
# Put the module in evaluation mode; it is only used for inference here.
_embed_model.eval()

async def _embed_batch(texts: list[str]):
    """Embed a list of strings with the notebook-level tokenizer and model.

    Delegates to ``hf_embed`` using the module-level ``_tokenizer`` and
    ``_embed_model`` and returns whatever ``hf_embed`` produces.
    """
    embeddings = await hf_embed(
        texts,
        tokenizer=_tokenizer,
        embed_model=_embed_model,
    )
    return embeddings

async def _embed_dispatch(input_text):
    """Embed either one string or a list/tuple of strings.

    A single string is embedded as a one-element batch and the first result is
    returned; a list/tuple made up only of strings is embedded as a batch.

    Raises:
        TypeError: if ``input_text`` is neither a ``str`` nor a list/tuple of ``str``.

    NOTE(review): this helper is not referenced in the visible cells (the
    ``EmbeddingFunc`` is built from a lambda instead) - confirm it is still needed.
    """
    is_single = isinstance(input_text, str)
    if not is_single:
        is_sequence = isinstance(input_text, (list, tuple))
        if not (is_sequence and all(isinstance(item, str) for item in input_text)):
            raise TypeError(f"Unsupported input type for embedding_func: {type(input_text)}")

    batch = [input_text] if is_single else list(input_text)
    vectors = await _embed_batch(batch)
    return vectors[0] if is_single else vectors

# Describe the embedder to MiniRAG: the vector width comes from the model config
# and the token limit from the tokenizer.
_embedding_func = EmbeddingFunc(
    embedding_dim=_embed_model.config.hidden_size,
    max_token_size=_tokenizer.model_max_length,
    # Returns the result of calling hf_embed directly (the same call that
    # _embed_batch awaits); the caller is presumably expected to await it.
    func = lambda texts: hf_embed(texts, tokenizer=_tokenizer, embed_model=_embed_model),
)
# Global RAG instance used by the later indexing and evaluation cells.
rag = minirag.MiniRAG(
    working_dir=WORKING_DIR,
    # Falls back to None when LLM_MODEL_NAME is empty/falsy.
    llm_model_func=ollama.ollama_model_complete if LLM_MODEL_NAME else None,
    llm_model_name=LLM_MODEL_NAME,
    embedding_func=_embedding_func,
    log_level=LOG_LEVEL,
    suppress_httpx_logging=True,
    entity_presidio_extraction=False,
)
# rag = minirag.MiniRAG(
#     working_dir=WORKING_DIR,
#     llm_model_func=openai_queue_completion,
#     llm_model_max_token_size=200,
#     llm_model_kwargs={"api_key": api_key},
#     # llm_model_name=LLM_MODEL_NAME,
#     llm_model_name="gpt-5-nano",
#     embedding_func=_embedding_func,
#     log_level=LOG_LEVEL,
#     suppress_httpx_logging=True
# )

Init device: cuda
Loading embedding tokenizer/model...


INFO:nano-vectordb:Load (26366, 384) data
INFO:nano-vectordb:Init {'embedding_dim': 384, 'metric': 'cosine', 'storage_file': 'C:\\Users\\Francisco Azeredo\\Downloads\\gerador_documentos_gpt_azure (1)\\gerador_documentos_gpt_azure\\storage\\vdb_entities.json'} 26366 data
INFO:nano-vectordb:Init {'embedding_dim': 384, 'metric': 'cosine', 'storage_file': 'C:\\Users\\Francisco Azeredo\\Downloads\\gerador_documentos_gpt_azure (1)\\gerador_documentos_gpt_azure\\storage\\vdb_entities.json'} 26366 data
INFO:nano-vectordb:Load (26366, 384) data
INFO:nano-vectordb:Init {'embedding_dim': 384, 'metric': 'cosine', 'storage_file': 'C:\\Users\\Francisco Azeredo\\Downloads\\gerador_documentos_gpt_azure (1)\\gerador_documentos_gpt_azure\\storage\\vdb_entities_name.json'} 26366 data
INFO:nano-vectordb:Load (26366, 384) data
INFO:nano-vectordb:Init {'embedding_dim': 384, 'metric': 'cosine', 'storage_file': 'C:\\Users\\Francisco Azeredo\\Downloads\\gerador_documentos_gpt_azure (1)\\gerador_documentos_gpt_

In [2]:
import torch
import os, json, random, time, gc
from pathlib import Path
import psutil
from tqdm.auto import tqdm
from docx import Document
# ---------------- User Config ----------------
SHUFFLE_DOCS = True
MAX_DOCS = None  # set int to limit docs
# NOTE(review): EMBEDDING_MODEL, WORKING_DIR, LLM_MODEL_NAME and LOG_LEVEL repeat the
# values from the initialization cell; keep them in sync (or define them only once).
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
# NOTE(review): absolute, user-specific paths; adjust before running on another machine.
DATASET_DIR = r"C:\Users\Francisco Azeredo\Downloads\gerador_documentos_gpt_azure (1)\gerador_documentos_gpt_azure\curated_docs"
WORKING_DIR = r"C:\Users\Francisco Azeredo\Downloads\gerador_documentos_gpt_azure (1)\gerador_documentos_gpt_azure\storage"
LLM_MODEL_NAME = "qwen2.5:latest"
LOG_LEVEL = "CRITICAL"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# The `rag` object is created by the initialization cell (labelled "Cell 0");
# fail early with an accurate pointer when that cell has not been run.
assert 'rag' in globals(), "rag not found. Run Cell 0 (RAG Initialization) first."

# Handle to the current process for RSS reporting; None disables memory reporting.
try:
    PROCESS = psutil.Process()
except Exception:
    PROCESS = None
# ---------------- Helpers ----------------

def memory_mb():
    """Return the current process RSS in MiB, or None when ``PROCESS`` is None."""
    if PROCESS is not None:
        rss_bytes = PROCESS.memory_info().rss
        return rss_bytes / (1024 * 1024)
    return None

def read_text_from_file(path: Path) -> str:
    """Extract plain text from a supported document file.

    Supported suffixes (case-insensitive):
      * ``.txt`` / ``.md``      - file contents as-is.
      * ``.json``               - first string found under ``text``, ``content``,
                                  ``body`` or ``article`` (top-level dict only),
                                  otherwise the whole document re-serialized.
      * ``.jsonl`` / ``.ndjson``- one entry per non-empty line, joined with newlines;
                                  lines that are not valid JSON are kept verbatim.
      * ``.docx``               - non-empty paragraphs joined with newlines.

    Args:
        path: File to read.

    Returns:
        The extracted text, or ``""`` for unsupported suffixes and for files that
        could not be read. Read failures are printed instead of being returned as
        text, so callers never mistake an error message for document content.
    """
    text_keys = ("text", "content", "body", "article")

    def _pick_text(obj):
        """Return the first string stored under a known text key, else None."""
        if isinstance(obj, dict):
            for key in text_keys:
                value = obj.get(key)
                if isinstance(value, str):
                    return value
        return None

    suffix = path.suffix.lower()
    try:
        if suffix in {".txt", ".md"}:
            return path.read_text(encoding="utf-8", errors="ignore")

        if suffix == ".json":
            data = json.loads(path.read_text(encoding="utf-8", errors="ignore"))
            picked = _pick_text(data)
            if picked is not None:
                return picked
            # ensure_ascii=False keeps accented characters readable instead of
            # turning them into \uXXXX escapes before the text is embedded.
            return json.dumps(data, ensure_ascii=False)

        if suffix in {".jsonl", ".ndjson"}:
            lines = []
            with path.open("r", encoding="utf-8", errors="ignore") as f:
                for raw_line in f:
                    line = raw_line.strip()
                    if not line:
                        continue
                    try:
                        obj = json.loads(line)
                    except Exception:
                        # Best effort: keep lines that are not valid JSON as plain text.
                        lines.append(line)
                        continue
                    if isinstance(obj, dict):
                        picked = _pick_text(obj)
                        lines.append(picked if picked is not None else json.dumps(obj, ensure_ascii=False))
                    else:
                        lines.append(str(obj))
            return "\n".join(lines)

        if suffix == ".docx":
            # Read .docx files using python-docx
            doc = Document(path)
            paragraphs = [p.text.strip() for p in doc.paragraphs if p.text.strip()]
            return "\n".join(paragraphs)
    except Exception as e:
        # Report the failure but return no text so the file is skipped, not indexed.
        print(f"ERROR_READING_FILE {path.name}: {e}")
        return ""
    return ""

def load_documents(root_dir: str):
    """Collect readable documents under ``root_dir`` (searched recursively).

    Files are filtered by suffix, optionally shuffled (``SHUFFLE_DOCS``) and read
    with ``read_text_from_file``; files yielding no text are skipped. Collection
    stops once ``MAX_DOCS`` documents were gathered (when ``MAX_DOCS`` is truthy).

    Returns:
        A list of dicts with keys ``id`` (``doc_<n>`` in collection order),
        ``text`` and ``source_path``.
    """
    allowed_suffixes = (".txt", ".md", ".json", ".jsonl", ".ndjson", ".docx")

    candidates = []
    for entry in Path(root_dir).rglob("*"):
        if entry.suffix.lower() in allowed_suffixes and entry.is_file():
            candidates.append(entry)

    if SHUFFLE_DOCS:
        random.shuffle(candidates)

    documents = []
    for file_path in candidates:
        if MAX_DOCS and len(documents) >= MAX_DOCS:
            break
        content = read_text_from_file(file_path).strip()
        if content:
            documents.append({
                "id": f"doc_{len(documents)}",
                "text": content,
                "source_path": str(file_path),
            })
    return documents

# ---------------- Indexing ----------------
async def index_documents(rag):
    """Load every document under ``DATASET_DIR`` and insert it into ``rag``.

    Documents are inserted one at a time with ``rag.ainsert``; a failure on one
    document is printed and does not stop the run. Progress, timing and (when
    available) process memory before/after are printed.

    Args:
        rag: Object exposing an awaitable ``ainsert(text, metadata=..., file_path=...)``.

    Returns:
        None. Returns early when no documents are found.
    """
    print("Loading documents...")
    t0 = time.perf_counter()
    docs = load_documents(DATASET_DIR)
    print(f"Loaded {len(docs)} docs in {time.perf_counter()-t0:.2f}s")
    if not docs:
        print("No documents found; adjust DATASET_DIR.")
        return

    start_mem = memory_mb()
    if start_mem is not None:
        print(f"Start RSS: {start_mem:.2f} MB")

    print("Indexing with ainsert() ...")
    inserted = 0
    failed = 0
    t1 = time.perf_counter()
    for doc in tqdm(docs, desc="Indexing", total=len(docs)):
        metadata = {"id": doc["id"], "source": doc["source_path"]}
        try:
            await rag.ainsert(doc["text"], metadata=metadata, file_path=metadata.get("source"))
        except Exception as e:
            failed += 1
            print(f"Failed {metadata.get('id')}: {e}")
        else:
            inserted += 1
    dur = time.perf_counter() - t1

    # Report only documents that were actually inserted; failures are counted separately.
    rate = inserted / dur if dur > 0 else 0.0
    print(f"Inserted {inserted}/{len(docs)} docs in {dur:.2f}s ({rate:.2f} docs/s)")
    if failed:
        print(f"{failed} docs failed to index; see messages above.")

    gc.collect()
    end_mem = memory_mb()
    # Both readings are needed for the delta; either may be None when psutil is unavailable.
    if end_mem is not None and start_mem is not None:
        print(f"End RSS: {end_mem:.2f} MB (Δ {end_mem - start_mem:.2f} MB)")
global rag
await index_documents(rag)
print("Indexing complete. Proceed to Cell 2 for querying & evaluation.")

Device: cuda
Loading documents...
Loaded 20 docs in 0.12s
Start RSS: 1030.40 MB
Indexing with ainsert() ...


Indexing:   0%|          | 0/20 [00:00<?, ?it/s]

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Inserted 20 docs in 43326.59s (0.00 docs/s)cated), 850 relations(duplicated)
End RSS: 891.28 MB (Δ -139.12 MB)
Indexing complete. Proceed to Cell 2 for querying & evaluation.


In [None]:
# Cell 2: Query & QA Evaluation
# ----------------------------------------------
# Run AFTER Cell 1. Uses the global `rag` object and indexed data.
# Timeout mitigation strategies applied: reduced limit, concise prompts, retries with backoff.

# ----------- Constants & Config - Using Relative Paths --------------
from pathlib import Path

# Updated to use relative paths after reorganization
# NOTE(review): assumes the kernel's working directory is the notebook's folder,
# one level below the project root - confirm when launching from elsewhere.
NOTEBOOK_DIR = Path.cwd()
PROJECT_ROOT = NOTEBOOK_DIR.parent

QA_JSON_PATH = str(PROJECT_ROOT / "datasets" / "qa_dataset.json")  # Path to QA dataset (JSON)
OUTPUT_CSV_PATH = str(PROJECT_ROOT / "results_csv")                # Output directory for CSV results
MAX_Q = None                       # Limit number of QA pairs (None = all)
TOP_K = 3                          # Top-K docs to retrieve per query
MAX_RETRIES = 2                    # Max retries per query
PER_QUERY_DEADLINE = 30            # Max seconds per query
TOKEN_LIMIT = 512                  # Token limit for truncation (if used)

# Verify paths
print("✓ Paths configured:")
print(f"  Q&A Dataset: {QA_JSON_PATH}")
print(f"  Output Folder: {OUTPUT_CSV_PATH}")
print(f"  Dataset exists: {Path(QA_JSON_PATH).exists()}")
print(f"  Output folder exists: {Path(OUTPUT_CSV_PATH).exists()}")
# --------------------------------------------

import os, csv, time, json, random, re, statistics, asyncio, math
from uu import Error
from minirag import QueryParam
from minirag.utils import calculate_similarity  # legacy helper (returns indices) ‚Äì not used now
from nltk.metrics import edit_distance
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.tokenize import word_tokenize, sent_tokenize
from rouge import Rouge
from sentence_transformers import SentenceTransformer
import numpy as np
from tqdm.auto import tqdm

# Splits text on runs of non-word characters; used by tokenize_pt.
TOKEN_SPLIT_RE = re.compile(r"\W+", re.UNICODE)
# Lazily created singletons, populated on first use by _lazy_rouge() / _lazy_bert().
_ROUGE = None
_BERT_MODEL = None
# Smoothing function handed to sentence_bleu in compute_bleu.
_SMOOTH = SmoothingFunction().method1

def _lazy_rouge():
    """Return the shared ``Rouge`` scorer, constructing it on first call."""
    global _ROUGE
    scorer = _ROUGE
    if scorer is None:
        scorer = _ROUGE = Rouge()
    return scorer

def _lazy_bert():
    """Return the shared ``SentenceTransformer`` model, loading it on first call."""
    global _BERT_MODEL
    model = _BERT_MODEL
    if model is None:
        model = _BERT_MODEL = SentenceTransformer('all-MiniLM-L6-v2')
    return model

def tokenize_pt(s: str):
    """Split ``s`` on non-word characters and return the lowercased, non-blank pieces."""
    tokens = []
    for piece in TOKEN_SPLIT_RE.split(s):
        if piece.strip():
            tokens.append(piece.lower())
    return tokens

def token_recall(answer: str, gold: str) -> float:
    """Fraction of distinct gold tokens that also occur in the answer.

    Returns 0.0 when the gold text yields no tokens.
    """
    answer_tokens = set(tokenize_pt(answer))
    gold_tokens = set(tokenize_pt(gold))
    if gold_tokens:
        shared = answer_tokens.intersection(gold_tokens)
        return len(shared) / len(gold_tokens)
    return 0.0

def jaccard_similarity(answer: str, gold: str) -> float:
    """Jaccard index (|A ∩ G| / |A ∪ G|) over the distinct tokens of both texts.

    Returns 0.0 when neither text yields any token.
    """
    answer_tokens = set(tokenize_pt(answer))
    gold_tokens = set(tokenize_pt(gold))
    combined = answer_tokens.union(gold_tokens)
    if len(combined) == 0:
        return 0.0
    shared = answer_tokens.intersection(gold_tokens)
    return len(shared) / len(combined)

def compute_rouge1(hyp: str, ref: str):
    """Score ``hyp`` against ``ref`` with the shared Rouge scorer.

    Args:
        hyp: Generated text.
        ref: Reference text.

    Returns:
        The first score dict returned by ``Rouge.get_scores``. When either text
        is blank, or scoring raises, a dict with zeroed ``rouge-1`` f/p/r values
        is returned instead.
    """
    zero_scores = {"rouge-1": {"f": 0.0, "p": 0.0, "r": 0.0}}
    if not hyp.strip() or not ref.strip():
        return zero_scores
    try:
        return _lazy_rouge().get_scores(hyp, ref)[0]
    except Exception:
        # Scoring failures degrade to zeros; unlike a bare `except:`, this lets
        # KeyboardInterrupt/SystemExit stop a long evaluation run.
        return zero_scores

def compute_bleu(hyp: str, ref: str) -> float:
    """Sentence-level BLEU of ``hyp`` against the single reference ``ref``.

    Both texts are tokenized with ``tokenize_pt`` and scored with the
    module-level smoothing function.

    Returns:
        The BLEU score, or 0.0 when either text has no tokens or scoring raises.
    """
    h_toks = tokenize_pt(hyp)
    r_toks = tokenize_pt(ref)
    if not h_toks or not r_toks:
        return 0.0
    try:
        return sentence_bleu([r_toks], h_toks, smoothing_function=_SMOOTH)
    except Exception:
        # Scoring failures degrade to 0.0; unlike a bare `except:`, this lets
        # KeyboardInterrupt/SystemExit stop a long evaluation run.
        return 0.0

def compute_bert_similarity(hyp: str, ref: str) -> float:
    """Cosine similarity between the sentence embeddings of ``hyp`` and ``ref``.

    Returns 0.0 without loading the model when either text is blank.
    """
    if not (hyp.strip() and ref.strip()):
        return 0.0
    vectors = _lazy_bert().encode([hyp, ref])
    hyp_vec = vectors[0]
    ref_vec = vectors[1]
    dot_product = np.dot(hyp_vec, ref_vec)
    norm_product = np.linalg.norm(hyp_vec) * np.linalg.norm(ref_vec)
    return float(dot_product / norm_product)

def best_sentence_overlap(answer: str, gold: str) -> float:
    """Highest ``token_recall`` over all (answer sentence, gold sentence) pairs.

    Sentences are split with NLTK's Portuguese sentence tokenizer. Returns 0.0
    when there are no sentence pairs or no pair shares a token.
    """
    answer_sentences = sent_tokenize(answer, language="portuguese")
    gold_sentences = sent_tokenize(gold, language="portuguese")
    pair_scores = (
        token_recall(answer_sentence, gold_sentence)
        for answer_sentence in answer_sentences
        for gold_sentence in gold_sentences
    )
    return max(pair_scores, default=0.0)

def normalize_text(s: str) -> str:
    """Trim surrounding whitespace and lowercase ``s`` for exact-match comparison."""
    trimmed = s.strip()
    return trimmed.lower()

# Load QA pairs from JSON
# Each entry: {"question", "gold_answer", "context", "file"}.
qa_pairs = []
if os.path.exists(QA_JSON_PATH):
    try:
        with open(QA_JSON_PATH, encoding="utf-8") as f:
            qa_data = json.load(f)
    except Exception as e:
        print(f"Error loading QA dataset: {e}")
        qa_data = []

    if not isinstance(qa_data, list):
        print("Error loading QA dataset: expected a JSON list of QA records.")
        qa_data = []

    # Validate record by record so one malformed entry does not discard the rest.
    skipped_records = 0
    for item in qa_data:
        question = item.get("pergunta") if isinstance(item, dict) else None
        answer = item.get("resposta") if isinstance(item, dict) else None
        if not isinstance(question, str) or not isinstance(answer, str):
            skipped_records += 1
            continue
        context = item.get("contexto")
        source_file = item.get("arquivo")
        qa_pairs.append({
            "question": question.strip(),
            "gold_answer": answer.strip(),
            # Optional fields may be missing or null in the dataset.
            "context": context.strip() if isinstance(context, str) else "",
            "file": source_file.strip() if isinstance(source_file, str) else ""
        })
    if skipped_records:
        print(f"Skipped {skipped_records} QA records without string 'pergunta'/'resposta'.")
else:
    print("QA dataset not found. Provide QA_JSON_PATH or ensure qa_dataset.json is present.")

# When a limit is set, evaluate a random sample rather than the first MAX_Q pairs.
if MAX_Q:
    random.shuffle(qa_pairs)
    qa_pairs = qa_pairs[:MAX_Q]

print(f"Loaded {len(qa_pairs)} Q&A pairs.")

# Evaluation loop
rows = []
latencies = []
correct_retrievals = 0

for idx, pair in enumerate(tqdm(qa_pairs, desc="Evaluating")):
    q = pair["question"]
    gold_answer = pair["gold_answer"]
    gold_context = pair["context"]
    gold_file = pair["file"]
    
    start_t = time.perf_counter()
    
    for retry in range(MAX_RETRIES):
        try:
            # Query with timeout
            async def query_with_timeout():
                return await asyncio.wait_for(
                    rag.aquery(q, param=QueryParam(top_k=TOP_K)),
                    timeout=PER_QUERY_DEADLINE
                )
            
            response_obj = await query_with_timeout()
            
            if not response_obj or not hasattr(response_obj, "answer"):
                print(f"Q#{idx+1}: No valid response. Skipping.")
                break
            
            answer_text = response_obj.answer.strip() if response_obj.answer else ""
            retrieved_context = " ".join([ctx.get("content", "") for ctx in response_obj.context]) if hasattr(response_obj, "context") else ""
            
            # Retrieval metrics
            retrieved_files = [ctx.get("file_path", "") for ctx in response_obj.context] if hasattr(response_obj, "context") else []
            rank = retrieved_files.index(gold_file) if gold_file in retrieved_files else -1
            if rank >= 0:
                correct_retrievals += 1
            
            # Answer quality metrics
            exact = 1.0 if normalize_text(answer_text) == normalize_text(gold_answer) else 0.0
            substring = 1.0 if gold_answer.lower() in answer_text.lower() else 0.0
            tok_recall = token_recall(answer_text, gold_answer)
            jaccard = jaccard_similarity(answer_text, gold_answer)
            rouge1_scores = compute_rouge1(answer_text, gold_answer)
            rouge1_f = rouge1_scores["rouge-1"]["f"]
            bleu_score = compute_bleu(answer_text, gold_answer)
            bert_cos = compute_bert_similarity(answer_text, gold_answer)
            
            lat = time.perf_counter() - start_t
            latencies.append(lat)
            
            rows.append({
                "question": q,
                "gold_answer": gold_answer,
                "generated_answer": answer_text,
                "gold_file": gold_file,
                "retrieved_files": "|".join(retrieved_files),
                "retrieval_rank": rank,
                "exact": exact,
                "substring": substring,
                "token_recall": tok_recall,
                "jaccard": jaccard,
                "rouge1_f": rouge1_f,
                "bleu": bleu_score,
                "bert_cos": bert_cos,
                "latency_s": lat
            })
            break
            
        except asyncio.TimeoutError:
            print(f"Q#{idx+1} timeout (retry {retry+1}/{MAX_RETRIES})")
            if retry == MAX_RETRIES - 1:
                print(f"Q#{idx+1}: Max retries exceeded. Skipping.")
        except Exception as e:
            print(f"Q#{idx+1} error: {e}")
            break

# Results summary
if rows:
    def _avg(key):
        """Mean of the numeric values stored under ``key`` across ``rows`` (0.0 if none)."""
        vals = [r[key] for r in rows if key in r and isinstance(r[key], (int,float))]
        return sum(vals)/len(vals) if vals else 0.0

    print(f"\n{'='*60}")
    print("RAG EVALUATION RESULTS")
    print(f"{'='*60}")

    # Retrieval accuracy is over all questions asked; answer metrics below are
    # averaged over answered questions only (the entries in `rows`).
    print("\n🎯 RETRIEVAL:")
    print(f"  Accuracy: {correct_retrievals}/{len(qa_pairs)} = {correct_retrievals/len(qa_pairs):.2%}")

    print("\n📝 ANSWER QUALITY:")
    print(f"  Exact Match: {_avg('exact'):.2%}")
    print(f"  Substring: {_avg('substring'):.2%}")
    print(f"  Token Recall: {_avg('token_recall'):.3f}")
    print(f"  Jaccard: {_avg('jaccard'):.3f}")
    print(f"  ROUGE-1 F1: {_avg('rouge1_f'):.3f}")
    print(f"  BLEU: {_avg('bleu'):.3f}")
    print(f"  BERT Similarity: {_avg('bert_cos'):.3f}")

    print("\n⚡ PERFORMANCE:")
    avg_lat = sum(latencies)/len(latencies)
    qps = 1/avg_lat if avg_lat > 0 else float("inf")
    print(f"  Avg Latency: {avg_lat:.2f}s")
    print(f"  QPS: {qps:.2f}")

    # Save results
    if OUTPUT_CSV_PATH:
        os.makedirs(OUTPUT_CSV_PATH, exist_ok=True)
        # The row count is part of the file name; a rerun with the same count overwrites it.
        out_file = os.path.join(OUTPUT_CSV_PATH, f"benchmark_light_{len(rows)}.csv")
        with open(out_file, "w", encoding="utf-8", newline="") as csvf:
            writer = csv.DictWriter(csvf, fieldnames=list(rows[0].keys()))
            writer.writeheader()
            writer.writerows(rows)
        print(f"\n✓ Results saved to: {out_file}")
else:
    print("\nNo valid results to report.")


Loaded 20 QA pairs.


Eval-mini:   0%|          | 0/20 [00:00<?, ?q/s]

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cuda:0
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2


Q1: Quais são os principais pontos abordados na proposta de lei sobre a proteção de ...
Answer: Infelizmente, não há informações disponíveis nas tabelas fornecidas que possam ser usadas diretamente para responder à sua pergunta sobre a proposta de lei sobr
Gold: A proposta de lei inclui os seguintes pontos principais: 1. Ampliação da definição e categorização dos dados pessoais, incluindo novas formas de tratamento que 
Metrics: {'exact': '0.000', 'substring': '0.000', 'token_recall': '0.588', 'jaccard': '0.148', 'levenshtein': '0.094', 'rouge1_f': '0.253', 'rouge2_f': '0.095', 'overlap': '1.000', 'bleu': '0.021', 'bert_cos': '0.768'} Latency: 257134.6 ms attempts=1
-
Q2: Qual é o prazo estabelecido para a conclusão da consulta pública e entrega da ve...
Answer: Infelizmente, as informações fornecidas em suas tabelas não contêm detalhes específicos sobre prazos ou cronogramas para a conclusão da consulta pública e entre
Gold: O prazo para a conclus√£o da consul