In [1]:
# Cell 0: RAG Initialization (Run First)
# -------------------------------------
# Sets up Weaviate collection using text2vec-transformers (external inference container).
# Ensure docker-compose is up with services: weaviate + t2v-transformers.
#   ENABLE_MODULES=text2vec-transformers,generative-ollama
#   DEFAULT_VECTORIZER_MODULE=text2vec-transformers
#   TRANSFORMERS_INFERENCE_API=http://t2v-transformers:8080
# Optional: generative-ollama (Ollama running on host for qwen2m:latest)
# Timeout tuning: Some weaviate client versions expose TimeoutConfig; if not, we fall back.

import weaviate
from weaviate.classes.config import Configure, Property, DataType, Tokenization# List user-site Jupyter packages

# Attempt optional TimeoutConfig (newer weaviate client). If missing, continue with defaults.
try:
    from weaviate.connect import TimeoutConfig  # may not exist in older versions
    TIMEOUTS = TimeoutConfig(init=60, query=180, insert=120)
    client = weaviate.connect_to_local(timeout_config=TIMEOUTS)
    print("Custom timeouts applied:", TIMEOUTS)
except ImportError:
    client = weaviate.connect_to_local()
    TIMEOUTS = None
    print("TimeoutConfig not available in this weaviate version; using default client timeouts.")
except TypeError:
    # Signature mismatch (older version). Reconnect without custom timeouts.
    client = weaviate.connect_to_local()
    TIMEOUTS = None
    print("TimeoutConfig signature unsupported; using default timeouts.")

EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
LLM_MODEL_NAME = "qwen2.5:latest"  # must match the model available to Ollama on host
DATASET_NAME = "AdministrativeDocuments"

# Diagnostics
try:
    meta = client.get_meta()
    print("Server modules detected:", list(meta.get("modules", {}).keys()))
except Exception as e:
    print("Meta fetch failed:", e)

# Recreate collection for a clean slate
try:
    client.collections.delete(DATASET_NAME)
except Exception:
    pass

api_endpoint = "http://host.docker.internal:11434"  # Ollama on host

client.collections.create(
    DATASET_NAME,
    description="Administrative documents repository",
    properties=[
        Property(name="text", data_type=DataType.TEXT, tokenization=Tokenization.LOWERCASE),
        Property(name="file_path", data_type=DataType.TEXT)
    ],
    vectorizer_config=[
        Configure.NamedVectors.text2vec_transformers(
            name="text_vector",
            source_properties=["text"],
            pooling_strategy="masked_mean",
        )
    ],
    generative_config=Configure.Generative.ollama(        
        api_endpoint=api_endpoint,
        model=LLM_MODEL_NAME
    )
)

assert client.collections.exists(DATASET_NAME)
print(f"Collection '{DATASET_NAME}' ready (text2vec-transformers).")

TimeoutConfig not available in this weaviate version; using default client timeouts.
Server modules detected: ['generative-anthropic', 'generative-anyscale', 'generative-aws', 'generative-cohere', 'generative-databricks', 'generative-friendliai', 'generative-google', 'generative-mistral', 'generative-nvidia', 'generative-octoai', 'generative-ollama', 'generative-openai', 'generative-xai', 'multi2multivec-jinaai', 'multi2vec-cohere', 'multi2vec-google', 'multi2vec-jinaai', 'multi2vec-nvidia', 'multi2vec-voyageai', 'reranker-cohere', 'reranker-jinaai', 'reranker-nvidia', 'reranker-voyageai', 'text2multivec-jinaai', 'text2vec-aws', 'text2vec-cohere', 'text2vec-databricks', 'text2vec-google', 'text2vec-huggingface', 'text2vec-jinaai', 'text2vec-mistral', 'text2vec-nvidia', 'text2vec-octoai', 'text2vec-openai', 'text2vec-transformers', 'text2vec-voyageai', 'text2vec-weaviate']


            Use the `vector_config` argument instead.
            


Collection 'AdministrativeDocuments' ready (text2vec-transformers).


In [2]:
import torch
import os, json, random, time, gc
from pathlib import Path
import psutil
from tqdm.auto import tqdm
from docx import Document
# ---------------- User Config ----------------
SHUFFLE_DOCS = True
MAX_DOCS = None  # set int to limit docs
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
DATASET_DIR = r"C:\Users\Francisco Azeredo\Downloads\gerador_documentos_gpt_azure (1)\gerador_documentos_gpt_azure\documentos_gerados"
WORKING_DIR = r"C:\Users\Francisco Azeredo\Downloads\gerador_documentos_gpt_azure (1)\gerador_documentos_gpt_azure"
LLM_MODEL_NAME = "qwen2.5:latest"
LOG_LEVEL = "CRITICAL"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

try:
    PROCESS = psutil.Process()
except Exception:
    PROCESS = None

# ---------------- Helpers ----------------

def memory_mb():
    if PROCESS is None: return None
    return PROCESS.memory_info().rss / (1024 * 1024)

def read_text_from_file(path: Path) -> str:
    suffix = path.suffix.lower()
    try:
        if suffix in {".txt", ".md"}:
            return path.read_text(encoding="utf-8", errors="ignore")
        if suffix == ".json":
            data = json.loads(path.read_text(encoding="utf-8", errors="ignore"))
            for k in ("text","content","body","article"):
                if isinstance(data, dict) and k in data and isinstance(data[k], str):
                    return data[k]
            return json.dumps(data)
        if suffix in {".jsonl", ".ndjson"}:
            lines = []
            with path.open("r", encoding="utf-8", errors="ignore") as f:
                for line in f:
                    line=line.strip()
                    if not line: continue
                    try:
                        obj=json.loads(line)
                        if isinstance(obj, dict):
                            for k in ("text","content","body","article"):
                                if k in obj and isinstance(obj[k], str):
                                    lines.append(obj[k]); break
                            else:
                                lines.append(json.dumps(obj))
                        else:
                            lines.append(str(obj))
                    except Exception:
                        lines.append(line)
            return "\n".join(lines)
        if suffix == ".docx":
            # Read .docx files using python-docx
            doc = Document(path)
            text = []
            for paragraph in doc.paragraphs:
                if paragraph.text.strip():
                    text.append(paragraph.text.strip())
            return "\n".join(text)
    except Exception as e:
        return f"ERROR_READING_FILE {path.name}: {e}"
    return ""

def load_documents(root_dir: str):
    exts = (".txt", ".md", ".json", ".jsonl", ".ndjson", ".docx")
    paths = [p for p in Path(root_dir).rglob("*") if p.suffix.lower() in exts and p.is_file()]
    if SHUFFLE_DOCS: random.shuffle(paths)
    docs = []
    for p in paths:
        if MAX_DOCS and len(docs) >= MAX_DOCS: break
        text = read_text_from_file(p).strip()
        if not text: continue
        docs.append({"id": f"doc_{len(docs)}", "text": text, "source_path": str(p)})
    return docs

# ---------------- Indexing ----------------
async def index_documents(rag):
    print("Loading documents...")
    t0 = time.perf_counter(); docs = load_documents(DATASET_DIR)
    print(f"Loaded {len(docs)} docs in {time.perf_counter()-t0:.2f}s")
    if not docs:
        print("No documents found; adjust DATASET_DIR."); return
    start_mem = memory_mb()
    if start_mem is not None: print(f"Start RSS: {start_mem:.2f} MB")
    texts = [d['text'] for d in docs]
    metas = [{"id": d['id'], "source": d['source_path']} for d in docs]
    print("Indexing (batched)...")
    t1 = time.perf_counter()
    failed = 0
    with rag.batch.dynamic() as batch:
        for text, metadata in tqdm(zip(texts, metas), desc="Indexing", total=len(texts)):
            try:
                batch.add_object(
                    properties={
                        "text": text,
                        "file_path": metadata.get("source")
                    }
                )
            except Exception as e:
                failed += 1
                if failed < 5:
                    print(f"Failed {metadata.get('id')}: {e}")
    dur = time.perf_counter()-t1
    print(f"Inserted {len(texts)-failed} / {len(texts)} docs in {dur:.2f}s ({((len(texts)-failed)/dur) if dur>0 else 0:.2f} docs/s)")
    if failed:
        print(f"Total failed: {failed}")
    gc.collect(); end_mem = memory_mb()
    if end_mem is not None: print(f"End RSS: {end_mem:.2f} MB (Œî {end_mem - start_mem:.2f} MB)")

rag = client.collections.get(DATASET_NAME)
await index_documents(rag)
print("Indexing complete. Proceed to Cell 2 for querying & evaluation.")

Device: cuda
Loading documents...
Loaded 1000 docs in 17.63s
Start RSS: 664.74 MB
Indexing (batched)...


Indexing:   0%|          | 0/1000 [00:00<?, ?it/s]

Inserted 1000 / 1000 docs in 10.53s (95.00 docs/s)
End RSS: 584.45 MB (Œî -80.29 MB)
Indexing complete. Proceed to Cell 2 for querying & evaluation.


In [None]:
# Cell 2: Query & QA Evaluation - Best Metrics Only
# ----------------------------------------------
# Run AFTER Cell 1. Uses the global `rag` object and indexed data.
# Optimized to focus on the most meaningful metrics for RAG evaluation.

import os, csv, time, json, random, re, statistics, asyncio, math
import sys
from pathlib import Path
from nltk.metrics import edit_distance
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.tokenize import word_tokenize, sent_tokenize
from rouge import Rouge
from sentence_transformers import SentenceTransformer
import numpy as np
from tqdm.auto import tqdm

# -------- Configuration - Using Relative Paths --------
# Updated to use relative paths after reorganization
NOTEBOOK_DIR = Path.cwd()
PROJECT_ROOT = NOTEBOOK_DIR.parent

# Add parent to path for imports
sys.path.insert(0, str(PROJECT_ROOT))

QA_JSON_PATH = str(PROJECT_ROOT / "datasets" / "qa_dataset.json")
OUTPUT_CSV_PATH = str(PROJECT_ROOT / "results")  # set to None to skip saving
TOP_K = 3            # lower to reduce vector fetch time
MAX_Q = None         # limit question count
RANDOM_SEED = 42
USE_BERT_SIM = True  # semantic metrics cost
PER_QUERY_DEADLINE = 60.0  # seconds, must be < client.query timeout
MAX_RETRIES = 2
RETRY_BACKOFF = 5.0  # seconds added each retry
PROMPT_PREFIX = "Answer briefly: "  # keep prompt short -> faster generation
COT_PROMPT = True  # chain-of-thought prompting (slower, may improve complex Qs)
random.seed(RANDOM_SEED)

# Verify paths
print(f"‚úì Paths configured:")
print(f"  Q&A Dataset: {QA_JSON_PATH}")
print(f"  Output Folder: {OUTPUT_CSV_PATH}")
print(f"  Dataset exists: {Path(QA_JSON_PATH).exists()}")
print(f"  Output folder exists: {Path(OUTPUT_CSV_PATH).exists()}")

TOKEN_SPLIT_RE = re.compile(r"\W+", re.UNICODE)
_ROUGE = None
_BERT_MODEL = None
_SMOOTH = SmoothingFunction().method1

def _lazy_rouge():
    global _ROUGE
    if _ROUGE is None:
        _ROUGE = Rouge()
    return _ROUGE

def _lazy_bert():
    global _BERT_MODEL
    if _BERT_MODEL is None:
        _BERT_MODEL = SentenceTransformer('all-MiniLM-L6-v2')
    return _BERT_MODEL

def normalize_text(s: str) -> str:
    return s.strip().lower()

def tokenize_pt(s: str):
    return [t.lower() for t in TOKEN_SPLIT_RE.split(s) if t.strip()]

def token_recall(answer: str, gold: str) -> float:
    at = set(tokenize_pt(answer))
    gt = set(tokenize_pt(gold))
    if not gt:
        return 0.0
    return len(at & gt) / len(gt)

def compute_rouge1(hyp: str, ref: str):
    if not hyp.strip() or not ref.strip():
        return {"rouge-1": {"f": 0.0, "p": 0.0, "r": 0.0}}
    try:
        return _lazy_rouge().get_scores(hyp, ref)[0]
    except:
        return {"rouge-1": {"f": 0.0, "p": 0.0, "r": 0.0}}

def compute_bleu(hyp: str, ref: str) -> float:
    h_toks = tokenize_pt(hyp)
    r_toks = tokenize_pt(ref)
    if not h_toks or not r_toks:
        return 0.0
    try:
        return sentence_bleu([r_toks], h_toks, smoothing_function=_SMOOTH)
    except:
        return 0.0

def compute_bert_similarity(hyp: str, ref: str) -> float:
    if not USE_BERT_SIM or not hyp.strip() or not ref.strip():
        return 0.0
    model = _lazy_bert()
    emb = model.encode([hyp, ref])
    return float(np.dot(emb[0], emb[1]) / (np.linalg.norm(emb[0]) * np.linalg.norm(emb[1])))

def best_sentence_overlap(answer: str, gold: str) -> float:
    sents_a = sent_tokenize(answer, language="portuguese")
    sents_g = sent_tokenize(gold, language="portuguese")
    best = 0.0
    for sa in sents_a:
        for sg in sents_g:
            tr = token_recall(sa, sg)
            if tr > best:
                best = tr
    return best

# Load QA
qa_pairs = []
if os.path.exists(QA_JSON_PATH):
    with open(QA_JSON_PATH, encoding="utf-8") as f:
        qa_data = json.load(f)
        for item in qa_data:
            if "pergunta" in item and "resposta" in item:
                qa_pairs.append({
                    "question": item["pergunta"].strip(),
                    "gold_answer": item["resposta"].strip(),
                    "context": item.get("contexto","").strip(),
                    "file": item.get("arquivo","").strip()
                })
else:
    print("QA JSON not found. Check QA_JSON_PATH.")

if MAX_Q:
    random.shuffle(qa_pairs)
    qa_pairs = qa_pairs[:MAX_Q]

print(f"Loaded {len(qa_pairs)} Q&A pairs.")

rows = []
latencies = []
correct_retrievals = 0
cot_phrase = " Let's think step by step." if COT_PROMPT else ""

for idx, pair in enumerate(tqdm(qa_pairs, desc="Evaluating (Best Metrics)")):
    q = pair["question"]
    gold_answer = pair["gold_answer"]
    gold_context = pair["context"]
    gold_file = pair["file"]
    
    start_t = time.perf_counter()
    try:
        response_obj = rag.query(PROMPT_PREFIX + q + cot_phrase, param={"limit": TOP_K})
        
        if not response_obj or not hasattr(response_obj, "answer"):
            print(f"Q#{idx+1}: No valid response. Skipping.")
            continue
        
        answer_text = response_obj.answer.strip() if response_obj.answer else ""
        retrieved_context = " ".join([ctx.get("content", "") for ctx in response_obj.context]) if hasattr(response_obj, "context") else ""
        
        # Retrieval metrics
        retrieved_files = [ctx.get("file_path", "") for ctx in response_obj.context] if hasattr(response_obj, "context") else []
        rank = retrieved_files.index(gold_file) if gold_file in retrieved_files else -1
        if rank >= 0:
            correct_retrievals += 1
        
        # Answer quality (best metrics)
        exact = 1.0 if normalize_text(answer_text) == normalize_text(gold_answer) else 0.0
        substring = 1.0 if gold_answer.lower() in answer_text.lower() else 0.0
        tok_recall = token_recall(answer_text, gold_answer)
        rouge1_scores = compute_rouge1(answer_text, gold_answer)
        rouge1_f = rouge1_scores["rouge-1"]["f"]
        bert_cos = compute_bert_similarity(answer_text, gold_answer) if USE_BERT_SIM else 0.0
        
        # Context quality
        context_token_recall = token_recall(retrieved_context, gold_context) if gold_context else 0.0
        context_rouge1 = compute_rouge1(retrieved_context, gold_context) if gold_context else {"rouge-1": {"f": 0.0}}
        context_bert = compute_bert_similarity(retrieved_context, gold_context) if USE_BERT_SIM and gold_context else 0.0
        
        lat = time.perf_counter() - start_t
        latencies.append(lat)
        
        rows.append({
            "question": q,
            "gold_answer": gold_answer,
            "generated_answer": answer_text,
            "gold_file": gold_file,
            "retrieved_files": "|".join(retrieved_files),
            "retrieval_rank": rank,
            "exact": exact,
            "substring": substring,
            "token_recall": tok_recall,
            "rouge1_f": rouge1_f,
            "bert_cos": bert_cos,
            "context_token_recall": context_token_recall,
            "context_rouge1_f": context_rouge1["rouge-1"]["f"],
            "context_bert_cos": context_bert,
            "latency_s": lat
        })
        
    except Exception as e:
        print(f"Q#{idx+1} error: {e}")
        continue

# Results summary
if rows:
    def _avg(key):
        vals = [r[key] for r in rows if key in r and isinstance(r[key], (int,float))]
        return sum(vals)/len(vals) if vals else 0.0
    
    # Report best metrics only
    print(f"\n{'='*60}")
    print(f"RAG EVALUATION RESULTS - BEST METRICS ONLY")
    print(f"{'='*60}")
    
    print(f"\nüéØ RETRIEVAL PERFORMANCE:")
    print(f"  Document Retrieval Accuracy: {correct_retrievals}/{len(qa_pairs)} = {correct_retrievals/len(qa_pairs):.2%}")
    print(f"  Average Retrieval Rank: {_avg('retrieval_rank'):.1f}")
    
    print(f"\nüìù ANSWER QUALITY:")
    print(f"  Exact Match: {_avg('exact'):.2%}")
    print(f"  Substring Match: {_avg('substring'):.2%}")
    print(f"  Token Recall: {_avg('token_recall'):.3f}")
    print(f"  ROUGE-1 F1: {_avg('rouge1_f'):.3f}")
    if 'bert_cos' in rows[0]:
        print(f"  BERT Similarity: {_avg('bert_cos'):.3f}")
    
    print(f"\nüîç CONTEXT QUALITY:")
    if 'context_token_recall' in rows[0]:
        print(f"  Context Token Recall: {_avg('context_token_recall'):.3f}")
    if 'context_rouge1_f' in rows[0]:
        print(f"  Context ROUGE-1 F1: {_avg('context_rouge1_f'):.3f}")
    if 'context_bert_cos' in rows[0]:
        print(f"  Context BERT Similarity: {_avg('context_bert_cos'):.3f}")
    
    print(f"\n‚ö° PERFORMANCE:")
    avg_lat = sum(latencies)/len(latencies)
    p95 = statistics.quantiles(latencies, n=20)[18] if len(latencies)>=20 else max(latencies)
    print(f"  Average Latency: {avg_lat*1000:.1f}ms")
    print(f"  95th Percentile Latency: {p95*1000:.1f}ms")
    print(f"  Questions per Second: {1/avg_lat:.2f}")
    
    # Save results
    if OUTPUT_CSV_PATH and rows:
        os.makedirs(OUTPUT_CSV_PATH, exist_ok=True)
        mode = "AgentCoT" if COT_PROMPT else "Agent"
        n = len(rows)
        out_file = os.path.join(OUTPUT_CSV_PATH, f"{mode}_{n}.csv")
        with open(out_file, "w", encoding="utf-8", newline="") as csvf:
            writer = csv.DictWriter(csvf, fieldnames=rows[0].keys())
            writer.writeheader()
            writer.writerows(rows)
        print(f"\n‚úì Results saved to: {out_file}")
else:
    print("\nNo valid results to report.")


In [None]:
# Test connections before running benchmark
import requests
import time

print("üîç Testing connections...")

# Test Ollama
try:
    response = requests.get("http://localhost:11434/api/tags", timeout=5)
    if response.status_code == 200:
        models = response.json().get('models', [])
        print(f"‚úÖ Ollama is running with {len(models)} models")
        qwen_available = any('qwen2.5' in model.get('name', '') for model in models)
        if qwen_available:
            print("‚úÖ qwen2.5 model is available")
        else:
            print("‚ùå qwen2.5 model not found - run: ollama pull qwen2.5:latest")
    else:
        print(f"‚ùå Ollama responded with status {response.status_code}")
except Exception as e:
    print(f"‚ùå Ollama connection failed: {e}")
    print("Run: ollama serve")

# Test Weaviate
try:
    if 'client' in globals():
        meta = client.get_meta()
        print(f"‚úÖ Weaviate is running, modules: {list(meta.get('modules', {}).keys())}")
        
        # Test collection exists
        if client.collections.exists("Dataset"):
            collection = client.collections.get("Dataset")
            aggregate = collection.aggregate.over_all(total_count=True)
            count = aggregate.total_count
            print(f"‚úÖ Dataset collection has {count} documents")
            
            # Simple test query (no generation)
            try:
                test_result = collection.query.near_text(
                    query="test", 
                    limit=1,
                    return_metadata=['distance']
                )
                print("‚úÖ Vector search is working")
            except Exception as e:
                print(f"‚ùå Vector search failed: {e}")
        else:
            print("‚ùå Dataset collection not found")
    else:
        print("‚ùå Weaviate client not initialized")
except Exception as e:
    print(f"‚ùå Weaviate connection failed: {e}")

print("\nüéØ If all tests pass, you can proceed with the benchmark.")
print("üéØ If tests fail, fix the issues before running the evaluation.")