In [None]:
# Cell 0: RAG Initialization (Run First)
# -------------------------------------
# Sets up Weaviate collection using text2vec-transformers (external inference container).
# Ensure docker-compose is up with services: weaviate + t2v-transformers.
#   ENABLE_MODULES=text2vec-transformers,generative-ollama
#   DEFAULT_VECTORIZER_MODULE=text2vec-transformers
#   TRANSFORMERS_INFERENCE_API=http://t2v-transformers:8080
# Optional: generative-ollama (Ollama running on host for qwen2m:latest)
# Timeout tuning: Some weaviate client versions expose TimeoutConfig; if not, we fall back.

import weaviate
from weaviate.classes.config import Configure, Property, DataType, Tokenization

# Attempt optional TimeoutConfig (newer weaviate client). If missing, continue with defaults.
try:
    from weaviate.connect import TimeoutConfig  # may not exist in older versions
    TIMEOUTS = TimeoutConfig(init=60, query=180, insert=120)
    client = weaviate.connect_to_local(timeout_config=TIMEOUTS)
    print("Custom timeouts applied:", TIMEOUTS)
except ImportError:
    client = weaviate.connect_to_local()
    TIMEOUTS = None
    print("TimeoutConfig not available in this weaviate version; using default client timeouts.")
except TypeError:
    # Signature mismatch (older version). Reconnect without custom timeouts.
    client = weaviate.connect_to_local()
    TIMEOUTS = None
    print("TimeoutConfig signature unsupported; using default timeouts.")

EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
LLM_MODEL_NAME = "qwen2.5:latest"  # must match the model available to Ollama on host
NAME_DATASET = "LiHuaWorld"

# Diagnostics
try:
    meta = client.get_meta()
    print("Server modules detected:", list(meta.get("modules", {}).keys()))
except Exception as e:
    print("Meta fetch failed:", e)

# Recreate collection for a clean slate
try:
    client.collections.delete(NAME_DATASET)
except Exception:
    pass

api_endpoint = "http://host.docker.internal:11434"  # Ollama on host

client.collections.create(
    NAME_DATASET,
    description="A collection of time-indexed stories for LiHua. The time is indexed in the file_path property. And the text property contains the story content.",
    properties=[
        Property(name="text", data_type=DataType.TEXT, tokenization=Tokenization.LOWERCASE),
        Property(name="file_path", data_type=DataType.TEXT)
    ],
    vectorizer_config=[
        Configure.NamedVectors.text2vec_transformers(
            name="text_vector",
            source_properties=["text"],
            pooling_strategy="masked_mean",
        )
    ],
    generative_config=Configure.Generative.ollama(
        api_endpoint=api_endpoint,
        model=LLM_MODEL_NAME
    )
)

assert client.collections.exists(NAME_DATASET)
print(f"Collection '{NAME_DATASET}' ready (text2vec-transformers).")

TimeoutConfig not available in this weaviate version; using default client timeouts.
Server modules detected: ['generative-anthropic', 'generative-anyscale', 'generative-aws', 'generative-cohere', 'generative-databricks', 'generative-friendliai', 'generative-google', 'generative-mistral', 'generative-nvidia', 'generative-octoai', 'generative-ollama', 'generative-openai', 'generative-xai', 'multi2multivec-jinaai', 'multi2vec-cohere', 'multi2vec-google', 'multi2vec-jinaai', 'multi2vec-nvidia', 'multi2vec-voyageai', 'reranker-cohere', 'reranker-jinaai', 'reranker-nvidia', 'reranker-voyageai', 'text2multivec-jinaai', 'text2vec-aws', 'text2vec-cohere', 'text2vec-databricks', 'text2vec-google', 'text2vec-huggingface', 'text2vec-jinaai', 'text2vec-mistral', 'text2vec-nvidia', 'text2vec-octoai', 'text2vec-openai', 'text2vec-transformers', 'text2vec-voyageai', 'text2vec-weaviate']


UnexpectedStatusCodeError: Collection may not have been created properly.! Unexpected status code: 422, with response body: {'error': [{'message': "'LiHua World' is not a valid class name"}]}.

In [None]:
import torch
import os, json, random, time, gc
from pathlib import Path
import psutil
from tqdm.auto import tqdm
# ---------------- User Config ----------------
SHUFFLE_DOCS = True
MAX_DOCS = None  # set int to limit docs
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
DATASET_DIR = r"C:\\Users\\Francisco Azeredo\\OneDrive\\Documents\\tecnico\\5 ano\\tese\\Código\\MiniRAG\\dataset\\LiHua-World\\data\\"
WORKING_DIR = r"C:\\Users\\Francisco Azeredo\\OneDrive\\Documents\\tecnico\\5 ano\\tese\\Código\\MiniRAG\\notebooks\\storage"
LLM_MODEL_NAME = "qwen2m:latest"
LOG_LEVEL = "CRITICAL"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

try:
    PROCESS = psutil.Process()
except Exception:
    PROCESS = None

# ---------------- Helpers ----------------

def memory_mb():
    if PROCESS is None: return None
    return PROCESS.memory_info().rss / (1024 * 1024)

def read_text_from_file(path: Path) -> str:
    suffix = path.suffix.lower()
    try:
        if suffix in {".txt", ".md"}:
            return path.read_text(encoding="utf-8", errors="ignore")
        if suffix == ".json":
            data = json.loads(path.read_text(encoding="utf-8", errors="ignore"))
            for k in ("text","content","body","article"):
                if isinstance(data, dict) and k in data and isinstance(data[k], str):
                    return data[k]
            return json.dumps(data)
        if suffix in {".jsonl", ".ndjson"}:
            lines = []
            with path.open("r", encoding="utf-8", errors="ignore") as f:
                for line in f:
                    line=line.strip()
                    if not line: continue
                    try:
                        obj=json.loads(line)
                        if isinstance(obj, dict):
                            for k in ("text","content","body","article"):
                                if k in obj and isinstance(obj[k], str):
                                    lines.append(obj[k]); break
                            else:
                                lines.append(json.dumps(obj))
                        else:
                            lines.append(str(obj))
                    except Exception:
                        lines.append(line)
            return "\n".join(lines)
    except Exception as e:
        return f"ERROR_READING_FILE {path.name}: {e}"
    return ""

def load_documents(root_dir: str):
    exts = (".txt", ".md", ".json", ".jsonl", ".ndjson")
    paths = [p for p in Path(root_dir).rglob("*") if p.suffix.lower() in exts and p.is_file()]
    if SHUFFLE_DOCS: random.shuffle(paths)
    docs = []
    for p in paths:
        if MAX_DOCS and len(docs) >= MAX_DOCS: break
        text = read_text_from_file(p).strip()
        if not text: continue
        docs.append({"id": f"doc_{len(docs)}", "text": text, "source_path": str(p)})
    return docs

# ---------------- Indexing ----------------
async def index_documents(rag):
    print("Loading documents...")
    t0 = time.perf_counter(); docs = load_documents(DATASET_DIR)
    print(f"Loaded {len(docs)} docs in {time.perf_counter()-t0:.2f}s")
    if not docs:
        print("No documents found; adjust DATASET_DIR."); return
    start_mem = memory_mb()
    if start_mem is not None: print(f"Start RSS: {start_mem:.2f} MB")
    texts = [d['text'] for d in docs]
    metas = [{"id": d['id'], "source": d['source_path']} for d in docs]
    print("Indexing (batched)...")
    t1 = time.perf_counter()
    failed = 0
    with rag.batch.dynamic() as batch:
        for text, metadata in tqdm(zip(texts, metas), desc="Indexing", total=len(texts)):
            try:
                batch.add_object(
                    properties={
                        "text": text,
                        "file_path": metadata.get("source")
                    }
                )
            except Exception as e:
                failed += 1
                if failed < 5:
                    print(f"Failed {metadata.get('id')}: {e}")
    dur = time.perf_counter()-t1
    print(f"Inserted {len(texts)-failed} / {len(texts)} docs in {dur:.2f}s ({((len(texts)-failed)/dur) if dur>0 else 0:.2f} docs/s)")
    if failed:
        print(f"Total failed: {failed}")
    gc.collect(); end_mem = memory_mb()
    if end_mem is not None: print(f"End RSS: {end_mem:.2f} MB (Δ {end_mem - start_mem:.2f} MB)")

rag = client.collections.get(NAME_DATASET)
await index_documents(rag)
print("Indexing complete. Proceed to Cell 2 for querying & evaluation.")

Device: cuda
Loading documents...
Loaded 442 docs in 0.08s
Start RSS: 519.76 MB
Indexing (batched)...


Indexing:   0%|          | 0/442 [00:00<?, ?it/s]

Inserted 442 / 442 docs in 7.27s (60.82 docs/s)
End RSS: 525.43 MB (Δ 5.66 MB)
Indexing complete. Proceed to Cell 2 for querying & evaluation.


In [3]:
# Cell 2: Query & QA Evaluation
# ----------------------------------------------
# Run AFTER Cell 1. Uses the global `rag` object and indexed data.
# Timeout mitigation strategies applied: reduced limit, concise prompts, retries with backoff.

import os, csv, time, json, random, re, statistics, asyncio, math
from pathlib import Path
from minirag import QueryParam
from minirag.utils import calculate_similarity  # legacy helper (returns indices) – not used now
from nltk.metrics import edit_distance
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.tokenize import word_tokenize, sent_tokenize
from rouge import Rouge
from sentence_transformers import SentenceTransformer
import numpy as np
from tqdm.auto import tqdm

# -------- Configuration --------
QA_CSV_PATH = r"C:\Users\Francisco Azeredo\OneDrive\Documents\tecnico\5 ano\tese\Código\MiniRAG\dataset\LiHua-World\qa\query_set.csv"
OUTPUT_CSV_PATH = r"C:\Users\Francisco Azeredo\OneDrive\Documents\tecnico\5 ano\tese\Código\MiniRAG\notebooks"  # set to None to skip saving
TOP_K = 4            # lower to reduce vector fetch time
MAX_Q = None         # limit question count
RANDOM_SEED = 42
USE_BERT_SIM = True  # semantic metrics cost
PER_QUERY_DEADLINE = 55.0  # seconds, must be < client.query timeout
MAX_RETRIES = 2
RETRY_BACKOFF = 5.0  # seconds added each retry
PROMPT_PREFIX = "Answer briefly: "  # keep prompt short -> faster generation
random.seed(RANDOM_SEED)

TOKEN_SPLIT_RE = re.compile(r"\W+", re.UNICODE)
_ROUGE = None
_BERT_MODEL = None
_SMOOTH = SmoothingFunction().method1

def _lazy_rouge():
    global _ROUGE
    if _ROUGE is None:
        _ROUGE = Rouge()
    return _ROUGE

def _lazy_bert():
    global _BERT_MODEL
    if _BERT_MODEL is None:
        _BERT_MODEL = SentenceTransformer('all-MiniLM-L6-v2')
    return _BERT_MODEL

def normalize_text(s: str) -> str:
    return TOKEN_SPLIT_RE.sub(" ", s.lower()).strip()

def token_set(s: str) -> set[str]:
    return {t for t in normalize_text(s).split() if t}

def calculate_best_similarity(sentences: list[str], target: str, method="levenshtein", n=1):
    if not sentences:
        return 0.0
    tgt_tokens = target.lower().split()
    scores = []
    if method == "jaccard":
        tgt_set = set(tgt_tokens)
        for s in sentences:
            s_tokens = set(s.lower().split())
            inter = set(s_tokens) & tgt_set
            union = set(s_tokens) | tgt_set
            scores.append(len(inter) / len(union) if union else 0.0)
    elif method == "levenshtein":
        tgt_len = max(len(tgt_tokens), 1)
        for s in sentences:
            dist = edit_distance(tgt_tokens, s.lower().split())
            norm = max(tgt_len, len(s.split()))
            scores.append(1 - dist / norm if norm else 0.0)
    elif method == "rouge":
        key = f"rouge-{n}"
        r_inst = _lazy_rouge()
        for s in sentences:
            r = r_inst.get_scores(s, target)
            scores.append(r[0].get(key, {}).get("f", 0.0))
    elif method == "bert":
        model = _lazy_bert()
        embeddings = model.encode(sentences + [target], show_progress_bar=False)
        tgt_vec = embeddings[-1]
        tgt_norm = np.linalg.norm(tgt_vec)
        for i in range(len(sentences)):
            v = embeddings[i]
            denom = (np.linalg.norm(v) * tgt_norm)
            scores.append(float(np.dot(v, tgt_vec) / denom) if denom else 0.0)
    elif method == "overlap":
        tgt_set = set(tgt_tokens)
        for s in sentences:
            s_set = set(s.lower().split())
            inter = s_set & tgt_set
            denom = min(len(s_set), len(tgt_set))
            scores.append(len(inter) / denom if denom else 0.0)
    elif method == "bleu":
        tgt_bleu = word_tokenize(target.lower())
        for s in sentences:
            s_bleu = word_tokenize(s.lower())
            scores.append(sentence_bleu([tgt_bleu], s_bleu, smoothing_function=_SMOOTH))
    else:
        raise ValueError("Unsupported method.")
    return max(scores) if scores else 0.0

def compute_similarity(answer: str, gold: str, use_bert: bool = True) -> dict:
    sentences = sent_tokenize(answer)
    return {
        'jaccard': calculate_best_similarity(sentences, gold, method="jaccard"),
        'levenshtein': calculate_best_similarity(sentences, gold, method="levenshtein"),
        'rouge1_f': calculate_best_similarity(sentences, gold, method="rouge", n=1),
        'rouge2_f': calculate_best_similarity(sentences, gold, method="rouge", n=2),
        'overlap': calculate_best_similarity(sentences, gold, method="overlap"),
        'bleu': calculate_best_similarity(sentences, gold, method="bleu"),
        'bert_cos': calculate_best_similarity(sentences, gold, method="bert") if use_bert else None,
    }

def compute_metrics(answer: str, gold: str) -> dict:
    a_norm, g_norm = normalize_text(answer), normalize_text(gold)
    exact = bool(g_norm) and a_norm == g_norm
    substring = bool(g_norm) and g_norm in a_norm
    ts_a, ts_g = token_set(answer), token_set(gold)
    token_recall = (len(ts_a & ts_g) / len(ts_g)) if ts_g else 0.0
    sim = compute_similarity(answer, gold, use_bert=USE_BERT_SIM)
    if sim.get('bert_cos') is None:
        sim.pop('bert_cos', None)
    return {'exact': exact, 'substring': substring, 'token_recall': token_recall, **sim}

# -------- Load QA Pairs --------
qa_pairs = []
if os.path.exists(QA_CSV_PATH):
    with open(QA_CSV_PATH, encoding="utf-8") as f:
        reader = csv.DictReader(f)
        for row in reader:
            if "Question" in row and "Gold Answer" in row:
                qa_pairs.append((row["Question"].strip(), row["Gold Answer"].strip()))
else:
    print("QA CSV not found. Provide QA_CSV_PATH or create synthetic pairs manually.")
if MAX_Q:
    qa_pairs = qa_pairs[:MAX_Q]
print(f"Loaded {len(qa_pairs)} QA pairs.")
if not qa_pairs:
    raise SystemExit("No QA data available.")

from weaviate.classes.query import MetadataQuery
from weaviate.exceptions import WeaviateQueryError

async def run_eval(mode, n):
    rows, latencies = [], []
    for i, (question, gold) in enumerate(tqdm(qa_pairs, total=len(qa_pairs), desc=f"Eval-{mode}", unit="q"), start=1):
        attempt = 0
        answer = None
        start_overall = time.perf_counter()
        last_error = None
        while attempt <= MAX_RETRIES and (time.perf_counter() - start_overall) < PER_QUERY_DEADLINE and not answer:
            attempt += 1
            t0 = time.perf_counter()
            try:
                # output = rag.generate.near_text(
                #     query=question,
                #     limit=TOP_K,
                #     target_vector="text_vector",
                #     grouped_task=PROMPT_PREFIX + question[:512],  # concise prompt
                #     return_metadata=MetadataQuery(distance=True)
                # )
                
                output = rag.generate.bm25(
                    query=question,
                    limit=TOP_K,
                    grouped_task=PROMPT_PREFIX + question,  # concise prompt
                    grouped_properties=["text"],
                    return_metadata=MetadataQuery(distance=True)
                )
                gen = getattr(output, 'generative', None)
                if isinstance(gen, dict):
                    answer = gen.get('groupedResult') or gen.get('singleResult')
                else:
                    answer = getattr(gen, 'text', None)
                if not answer:
                    # fallback: concatenate retrieved texts
                    retrieved = []
                    for obj in getattr(output, 'objects', [])[:2]:
                        try:
                            retrieved.append(obj.properties.get('text', '')[:600])
                        except Exception:
                            pass
                    answer = (" \n---\n".join(retrieved) or "[no answer]")
            except WeaviateQueryError as e:
                last_error = e
                if attempt <= MAX_RETRIES:
                    time.sleep(RETRY_BACKOFF * attempt)
            except Exception as e:
                last_error = e
                if attempt <= MAX_RETRIES:
                    time.sleep(RETRY_BACKOFF * attempt)
            finally:
                latency = time.perf_counter() - t0
        total_elapsed = time.perf_counter() - start_overall
        if not answer:
            answer = f"[timeout after {total_elapsed:.1f}s attempts={attempt} last_error={last_error}]"
        latencies.append(total_elapsed)
        m = compute_metrics(answer, gold)
        rows.append({"question": question, "gold": gold, "answer": answer, "latency_s": total_elapsed, **m})
        if i <= 2:
            tqdm.write(f"Q{i}: {question[:80]}...")
            tqdm.write("Answer: " + answer[:160].replace('\n',' '))
            tqdm.write("Gold: " + gold[:160])
            fmt = {k: (f"{v:.3f}" if isinstance(v,(int,float)) and not (isinstance(v,float) and math.isnan(v)) else v) for k,v in m.items()}
            tqdm.write(f"Metrics: {fmt} Latency: {total_elapsed*1000:.1f} ms attempts={attempt}")
            tqdm.write('-')
    def _avg(key):
        vals = [r[key] for r in rows if key in r and isinstance(r[key], (int,float))]
        return sum(vals)/len(vals) if vals else 0.0
    print(f"\nAggregate: exact={_avg('exact'):.2%} substring={_avg('substring'):.2%} token_recall={_avg('token_recall'):.2%}")
    for mkey in ['jaccard','levenshtein','rouge1_f','rouge2_f','overlap','bleu','bert_cos']:
        if mkey in rows[0]:
            print(f"  {mkey}: {_avg(mkey):.3f}")
    avg_lat = sum(latencies)/len(latencies)
    p95_lat = sorted(latencies)[int(len(latencies)*0.95)-1] if len(latencies)>1 else latencies[0]
    print(f"Latency: avg={avg_lat*1000:.1f} ms p95={p95_lat*1000:.1f} ms")
    if OUTPUT_CSV_PATH and rows:
        os.makedirs(OUTPUT_CSV_PATH, exist_ok=True)
        out_file = os.path.join(OUTPUT_CSV_PATH, f"results_{mode}{n}.csv")
        write_header = not os.path.exists(out_file)
        with open(out_file,'a',encoding='utf-8',newline='') as f:
            w = csv.DictWriter(f, fieldnames=list(rows[0].keys()))
            if write_header: w.writeheader()
            w.writerows(rows)
        print("Saved results to", out_file)
    return rows

# Run evaluation
eval_results1 = await run_eval("light", 7)
print("Evaluation complete.")
client.close()

Loaded 637 QA pairs.


Eval-light:   0%|          | 0/637 [00:00<?, ?q/s]

Q1: Did Adam Smith send a message to Li Hua about the upcoming building maintenance ...
Answer: [timeout after 75.0s attempts=2 last_error=Query call with protocol GRPC search failed with message Deadline Exceeded.]
Gold: Yes
Metrics: {'exact': '0.000', 'substring': '0.000', 'token_recall': '0.000', 'jaccard': '0.000', 'levenshtein': '0.000', 'rouge1_f': '0.000', 'rouge2_f': '0.000', 'overlap': '0.000', 'bleu': '0.000', 'bert_cos': '-0.028'} Latency: 75015.9 ms attempts=2
-
Q2: Did Wolfgang ask Li Hua about watching "Star Wars: A New Hope" after he asked Li...
Answer: No, Wolfgang did not ask Li Hua about watching "Star Wars: A New Hope" after asking about going to see "Overwatch 3." The conversation in the provided text only
Gold: Yes
Metrics: {'exact': '0.000', 'substring': '0.000', 'token_recall': '0.000', 'jaccard': '0.000', 'levenshtein': '0.000', 'rouge1_f': '0.000', 'rouge2_f': '0.000', 'overlap': '0.000', 'bleu': '0.000', 'bert_cos': '0.097'} Latency: 17600.8 ms attempts=1
-
Q2