In [4]:
# Cell 0: RAG Initialization (Run First)
# -------------------------------------
# Loads embedding model, builds embedding_func, and instantiates a MiniRAG object.
# Does NOT ingest documents. Use the next cell to index.

import os, torch, sys
import minirag
from transformers import AutoTokenizer, AutoModel
from minirag.llm.hf import hf_embed
from minirag.utils import EmbeddingFunc
from minirag.llm import ollama
from minirag.llm.openai import openai_complete, openai_queue_completion
from minirag import MiniRAG
from tqdm.auto import tqdm
import dotenv

dotenv.load_dotenv()

api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError("OPENAI_API_KEY not set in environment. Set it before running this cell.")

sys.path.append(r'c:\Users\Francisco Azeredo\OneDrive\Documents\tecnico\5 ano\tese\Código\Chatbot\lightrag')

# Core configuration (shared by later cells)
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
WORKING_DIR = r"C:\\Users\\Francisco Azeredo\\OneDrive\\Documents\\tecnico\\5 ano\\tese\\Código\\MiniRAG\\notebooks\\storage"
LLM_MODEL_NAME = "qwen2m:latest"  # set to None if no local Ollama model
LOG_LEVEL = "CRITICAL"

os.makedirs(WORKING_DIR, exist_ok=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Init device:", device)

print("Loading embedding tokenizer/model...")
_tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL)
_embed_model = AutoModel.from_pretrained(EMBEDDING_MODEL).to(device)
_embed_model.eval()

async def _embed_batch(texts: list[str]):
    return await hf_embed(texts, tokenizer=_tokenizer, embed_model=_embed_model)

async def _embed_dispatch(input_text):
    if isinstance(input_text, str):
        return (await _embed_batch([input_text]))[0]


        
    if isinstance(input_text, (list, tuple)) and all(isinstance(t, str) for t in input_text):
        return await _embed_batch(list(input_text))
    raise TypeError(f"Unsupported input type for embedding_func: {type(input_text)}")

_embedding_func = EmbeddingFunc(
    embedding_dim=_embed_model.config.hidden_size,
    max_token_size=_tokenizer.model_max_length,
    func = lambda texts: hf_embed(texts, tokenizer=_tokenizer, embed_model=_embed_model),
)
# rag = minirag.MiniRAG(
#     working_dir=WORKING_DIR,
#     llm_model_func=ollama.ollama_model_complete if LLM_MODEL_NAME else None,
#     llm_model_name=LLM_MODEL_NAME,
#     embedding_func=_embedding_func,
#     log_level=LOG_LEVEL,
#     suppress_httpx_logging=True
# )
rag = minirag.MiniRAG(
    working_dir=WORKING_DIR,
    llm_model_func=openai_queue_completion,
    llm_model_max_token_size=200,
    llm_model_kwargs={"api_key": api_key},
    # llm_model_name=LLM_MODEL_NAME,
    llm_model_name="gpt-5-nano",
    embedding_func=_embedding_func,
    log_level=LOG_LEVEL,
    suppress_httpx_logging=True
)

Init device: cuda
Loading embedding tokenizer/model...


INFO:nano-vectordb:Init {'embedding_dim': 384, 'metric': 'cosine', 'storage_file': 'C:\\\\Users\\\\Francisco Azeredo\\\\OneDrive\\\\Documents\\\\tecnico\\\\5 ano\\\\tese\\\\Código\\\\MiniRAG\\\\notebooks\\\\storage\\vdb_entities.json'} 0 data
INFO:nano-vectordb:Init {'embedding_dim': 384, 'metric': 'cosine', 'storage_file': 'C:\\\\Users\\\\Francisco Azeredo\\\\OneDrive\\\\Documents\\\\tecnico\\\\5 ano\\\\tese\\\\Código\\\\MiniRAG\\\\notebooks\\\\storage\\vdb_entities_name.json'} 0 data
INFO:nano-vectordb:Init {'embedding_dim': 384, 'metric': 'cosine', 'storage_file': 'C:\\\\Users\\\\Francisco Azeredo\\\\OneDrive\\\\Documents\\\\tecnico\\\\5 ano\\\\tese\\\\Código\\\\MiniRAG\\\\notebooks\\\\storage\\vdb_relationships.json'} 0 data
INFO:nano-vectordb:Init {'embedding_dim': 384, 'metric': 'cosine', 'storage_file': 'C:\\\\Users\\\\Francisco Azeredo\\\\OneDrive\\\\Documents\\\\tecnico\\\\5 ano\\\\tese\\\\Código\\\\MiniRAG\\\\notebooks\\\\storage\\vdb_chunks.json'} 0 data


In [5]:
import os, time, json, random, gc, asyncio
from pathlib import Path
import psutil, torch
import minirag
from minirag.llm import ollama
from minirag.utils import EmbeddingFunc
from transformers import AutoTokenizer, AutoModel
from minirag.llm.hf import hf_embed
import dotenv

dotenv.load_dotenv()

api_key = os.getenv("OPENAI_API_KEY")
"""
Cell 1: Document Ingestion / Indexing Only
-----------------------------------------
Uses openai_queue_completion so LLM calls are queued into Batch API.
We schedule a controlled number of concurrent ainsert tasks (IN_FLIGHT_LIMIT)
so memory stays bounded. Each flush prints: batch id, size, cumulative totals.
"""
# ---------------- User Config ----------------
RANDOM_SEED = 42
SHUFFLE_DOCS = True
MAX_DOCS = None
DATASET_DIR = r"C:\\Users\\Francisco Azeredo\\OneDrive\\Documents\\tecnico\\5 ano\\tese\\Código\\MiniRAG\\dataset\\LiHua-World\\data\\"
IN_FLIGHT_LIMIT = 10          # max concurrent rag.ainsert tasks
TARGET_BATCH_QUEUE = 350      # flush once queued pending >= this (manager max_batch_size may also trigger)
FINAL_FLUSH = True
STATUS_EVERY = 60.0           # seconds

random.seed(RANDOM_SEED)

try:
    PROCESS = psutil.Process()
except Exception:
    PROCESS = None

def memory_mb():
    return PROCESS.memory_info().rss / 1048576 if PROCESS else None

def read_text_from_file(path: Path) -> str:
    suf = path.suffix.lower()
    try:
        if suf in {".txt", ".md"}:
            return path.read_text(encoding="utf-8", errors="ignore")
        if suf == ".json":
            data = json.loads(path.read_text(encoding="utf-8", errors="ignore"))
            if isinstance(data, dict):
                for k in ("text","content","body","article"):
                    if k in data and isinstance(data[k], str):
                        return data[k]
            return json.dumps(data)
        if suf in {".jsonl", ".ndjson"}:
            lines = []
            with path.open("r", encoding="utf-8", errors="ignore") as f:
                for line in f:
                    line=line.strip()
                    if not line: continue
                    try:
                        obj=json.loads(line)
                        if isinstance(obj, dict):
                            for k in ("text","content","body","article"):
                                if k in obj and isinstance(obj[k], str): lines.append(obj[k]); break
                            else: lines.append(json.dumps(obj))
                        else:
                            lines.append(str(obj))
                    except Exception:
                        lines.append(line)
            return "\n".join(lines)
    except Exception as e:
        return f"ERROR_READING_FILE {path.name}: {e}"
    return ""

def load_documents(root_dir: str):
    exts = (".txt", ".md", ".json", ".jsonl", ".ndjson")
    paths = [p for p in Path(root_dir).rglob("*") if p.suffix.lower() in exts and p.is_file()]
    if SHUFFLE_DOCS: random.shuffle(paths)
    docs = []
    for p in paths:
        if MAX_DOCS and len(docs) >= MAX_DOCS: break
        txt = read_text_from_file(p).strip()
        if not txt: continue
        docs.append({"id": f"doc_{len(docs)}", "text": txt, "source_path": str(p)})
    return docs

async def index_documents():
    from tqdm.auto import tqdm
    print("Loading documents...")
    t0 = time.perf_counter(); docs = load_documents(DATASET_DIR)
    print(f"Loaded {len(docs)} docs in {time.perf_counter()-t0:.2f}s")
    if not docs: return
    start_mem = memory_mb()
    if start_mem is not None: print(f"Start RSS: {start_mem:.1f} MB")

    # Access batch manager to monitor queue len
    from minirag.llm.openai import init_openai_batch_manager
    mgr = init_openai_batch_manager()

    in_flight: set[asyncio.Task] = set()
    total_started = 0
    last_status = time.time()

    async def spawn(doc):
        nonlocal total_started
        t = asyncio.create_task(rag.ainsert(doc['text'], metadata={"id": doc['id'], "source": doc['source_path']}, file_path=doc['source_path']))
        in_flight.add(t)
        t.add_done_callback(lambda f: in_flight.discard(t))
        total_started += 1

    pbar = tqdm(total=len(docs), desc="Ingest", unit="doc")
    for doc in docs:
        # Backpressure on number of in-flight tasks
        while len(in_flight) >= IN_FLIGHT_LIMIT:
            done, _ = await asyncio.wait(in_flight, return_when=asyncio.FIRST_COMPLETED)
            pbar.update(len(done))
        await spawn(doc)
        # Flush condition: queued pending requests beyond threshold
        qlen = getattr(mgr, 'queue_len', 0)
        if qlen >= TARGET_BATCH_QUEUE:
            print(f"[Trigger Flush] pending_queue={qlen} >= {TARGET_BATCH_QUEUE}")
            await mgr.flush()  # prints batch submit inside
        # Periodic status
        if time.time() - last_status >= STATUS_EVERY:
            snap = mgr.status_snapshot()
            print(f"[Status] queue={snap['queue_len']} total_batches={snap.get('total_batches')} submitted_reqs={snap.get('total_submitted_reqs')}")
            last_status = time.time()
    # Drain remaining tasks
    while in_flight:
        done, _ = await asyncio.wait(in_flight, return_when=asyncio.FIRST_COMPLETED)
        pbar.update(len(done))
    pbar.close()

    if FINAL_FLUSH and getattr(mgr, 'queue_len', 0):
        print(f"[Final Flush] remaining_queue={mgr.queue_len}")
        await mgr.flush()

    # Wait for any outstanding batch polling completions
    # Poll until all futures settled (approx): check active batches statuses
    pending_batches = True
    while pending_batches:
        snap = mgr.status_snapshot()
        active = [s for s in snap['batches'].values() if s not in ('completed','failed','cancelled','expired')]
        if not active:
            break
        print(f"[Await Batches] active={len(active)} snapshot={snap['batches']}")
        await asyncio.sleep(15)

    dur = time.perf_counter()-t0
    print(f"Inserted {len(docs)} docs in {dur:.2f}s")
    end_mem = memory_mb()
    if end_mem is not None: print(f"End RSS: {end_mem:.1f} MB (Δ {end_mem - start_mem:.1f} MB)")

await index_documents()
print("Indexing complete. Proceed to Cell 2 for querying & evaluation.")

Loading documents...
Loaded 442 docs in 12.25s
Start RSS: 884.8 MB


Ingest:   0%|          | 0/442 [00:00<?, ?doc/s]

[Batch Submit] id=batch_68ad8833b80c819089247b550e5877be size=16 total_batches=1 total_reqs=16
[Batch Submit] id=batch_68ad8917847881908143bf12a03e06f5 size=16 total_batches=2 total_reqs=32
⠼ Processed 4 chunks, 36 entities(duplicated), 27 relations(duplicated)d)

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


[Status] queue=16 total_batches=2 submitted_reqs=32
[Batch Submit] id=batch_68ad89cc73dc81909f021552f607a79a size=16 total_batches=3 total_reqs=48
[Batch Submit] id=batch_68ad8ab2ed988190818935c5c3e81e5e size=16 total_batches=4 total_reqs=64
[Batch Submit] id=batch_68ad8b841b0c8190be79facf40c82c57 size=16 total_batches=5 total_reqs=80
[Batch Submit] id=batch_68ad8c3c4e008190aca8ff2f550388b2 size=16 total_batches=6 total_reqs=96
[Status] queue=16 total_batches=6 submitted_reqs=96elations(duplicated)ed)
[Batch Submit] id=batch_68ad8d0b7f2c8190bd69f558fa811689 size=16 total_batches=7 total_reqs=112
[Batch Submit] id=batch_68ad8f289e108190971460c47ffc3572 size=16 total_batches=8 total_reqs=128
[Batch Submit] id=batch_68ad9542ce888190b30b69fb10e7fca3 size=16 total_batches=9 total_reqs=144
[Batch Submit] id=batch_68ad9fafcc84819097cbc3e3fb3e4123 size=16 total_batches=10 total_reqs=160
[Batch Submit] id=batch_68ada063d06081909e2ed7300b1039aa size=16 total_batches=11 total_reqs=176
[Batch Subm

CancelledError: 

In [None]:
# Cell 2: Query & QA Evaluation
# ----------------------------------------------
# Run AFTER Cell 1. Uses the global `rag` object and indexed data.
# Supports:
#  - Loading LiHua-World QA pairs from query_set.csv
#  - Evaluating answer quality with simple + lexical + semantic metrics
#  - Optional CSV logging

import os, csv, time, json, random, re, statistics, asyncio, math
from pathlib import Path
# from minirag import QueryParam
from minirag import QueryParam
from minirag.utils import calculate_similarity  # legacy helper (returns indices) – not used now

# Extra metric libs (lazy loads handled in compute_similarity)
from nltk.metrics import edit_distance
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from rouge import Rouge
from sentence_transformers import SentenceTransformer
import numpy as np
from tqdm.auto import tqdm

# -------- Configuration --------
QA_CSV_PATH = r"C:\Users\Francisco Azeredo\OneDrive\Documents\tecnico\5 ano\tese\Código\MiniRAG\dataset\LiHua-World\qa\query_set.csv"
OUTPUT_CSV_PATH = r"C:\Users\Francisco Azeredo\OneDrive\Documents\tecnico\5 ano\tese\Código\MiniRAG\notebooks"  # set to None to skip saving
QUERY_MODE = "naive"      # mini | light | naive | doc | meta | bm25
TOP_K = 5
MAX_Q = None             # limit question count
RANDOM_SEED = 42
USE_BERT_SIM = True       # toggle semantic similarity (slower)
EVAL_CONCURRENCY = 12      # max concurrent rag.aquery calls
random.seed(RANDOM_SEED)

# -------- Metrics Helpers --------
TOKEN_SPLIT_RE = re.compile(r"\W+", re.UNICODE)

# lazy globals
_ROUGE = None
_BERT_MODEL = None
_SMOOTH_FN = SmoothingFunction().method1


def _lazy_rouge():
    global _ROUGE
    if _ROUGE is None:
        _ROUGE = Rouge()
    return _ROUGE


def _lazy_bert():
    global _BERT_MODEL
    if _BERT_MODEL is None:
        _BERT_MODEL = SentenceTransformer('all-MiniLM-L6-v2')
    return _BERT_MODEL


def normalize_text(s: str) -> str:
    return TOKEN_SPLIT_RE.sub(" ", s.lower()).strip()


def token_set(s: str) -> set[str]:
    return {t for t in normalize_text(s).split() if t}

_BERT_MODEL = None
_ROUGE = None
_SMOOTH = SmoothingFunction().method1

def calculate_best_similarity(sentences: list[str], target: str, method="levenshtein", n=1):
    """
    Returns the highest similarity score (float) between any sentence in `sentences` and `target`.
    Methods: jaccard | levenshtein | rouge | bert | overlap | bleu
    For rouge, n=1 or 2 selects rouge-1 or rouge-2 F.
    """
    if not sentences:
        return 0.0
    tgt_tokens = target.lower().split()
    scores = []

    if method == "jaccard":
        tgt_set = set(tgt_tokens)
        for s in sentences:
            s_tokens = set(s.lower().split())
            inter = set(s_tokens).intersection(set(tgt_set))
            union = set(s_tokens).union(set(tgt_set))
            scores.append(len(inter) / len(union) if union else 0.0)

    elif method == "levenshtein":
        tgt_len = max(len(tgt_tokens), 1)
        for s in sentences:
            dist = edit_distance(tgt_tokens, s.lower().split())
            norm = max(tgt_len, len(s.split()))
            scores.append(1 - dist / norm if norm else 0.0)

    elif method == "rouge":
        global _ROUGE
        if _ROUGE is None:
            _ROUGE = Rouge()
        key = f"rouge-{n}"
        for s in sentences:
            r = _ROUGE.get_scores(s, target)
            scores.append(r[0].get(key, {}).get("f", 0.0))

    elif method == "bert":
        global _BERT_MODEL
        if _BERT_MODEL is None:
            _BERT_MODEL = SentenceTransformer("all-MiniLM-L6-v2")
        embeddings = _BERT_MODEL.encode(sentences + [target], show_progress_bar=False)
        tgt_vec = embeddings[-1]
        tgt_norm = np.linalg.norm(tgt_vec)
        for i in range(len(sentences)):
            v = embeddings[i]
            denom = (np.linalg.norm(v) * tgt_norm)
            scores.append(float(np.dot(v, tgt_vec) / denom) if denom else 0.0)

    elif method == "overlap":
        tgt_set = set(tgt_tokens)
        for s in sentences:
            s_set = set(s.lower().split())
            inter = s_set & tgt_set
            denom = min(len(s_set), len(tgt_set))
            scores.append(len(inter) / denom if denom else 0.0)

    elif method == "bleu":
        tgt_bleu = word_tokenize(target.lower())
        for s in sentences:
            s_bleu = word_tokenize(s.lower())
            scores.append(sentence_bleu([tgt_bleu], s_bleu, smoothing_function=_SMOOTH))
    else:
        raise ValueError("Unsupported method.")

    return max(scores) if scores else 0.0

def compute_similarity(answer: str, gold: str, use_bert: bool = True) -> dict:
    """Compute a bundle of similarity scores between answer and gold.

    Returns keys:
      jaccard, levenshtein, rouge1_f, rouge2_f, overlap, bleu, bert_cos (optional)
    """
    sentences = sent_tokenize(answer)
    jaccard = calculate_best_similarity(sentences, gold, method="jaccard")
    levenshtein = calculate_best_similarity(sentences, gold, method="levenshtein")
    rouge1_f = calculate_best_similarity(sentences, gold, method="rouge", n=1)
    rouge2_f = calculate_best_similarity(sentences, gold, method="rouge", n=2)
    overlap = calculate_best_similarity(sentences, gold, method="overlap")
    bleu = calculate_best_similarity(sentences, gold, method="bleu")
    bert_cos = calculate_best_similarity(sentences, gold, method="bert") if use_bert else None

    result = {
        'jaccard': jaccard,
        'levenshtein': levenshtein,
        'rouge1_f': rouge1_f,
        'rouge2_f': rouge2_f,
        'overlap': overlap,
        'bleu': bleu,
    }
    if bert_cos is not None:
        result['bert_cos'] = bert_cos
    return result


def compute_metrics(answer: str, gold: str) -> dict:
    # Basic lexical metrics
    a_norm, g_norm = normalize_text(answer), normalize_text(gold)
    exact = bool(g_norm) and a_norm == g_norm
    substring = bool(g_norm) and g_norm in a_norm
    ts_a, ts_g = token_set(answer), token_set(gold)
    token_recall = (len(ts_a & ts_g) / len(ts_g)) if ts_g else 0.0

    sim_bundle = compute_similarity(answer, gold, use_bert=USE_BERT_SIM)

    return {
        'exact': exact,
        'substring': substring,
        'token_recall': token_recall,
        **sim_bundle,
    }

# -------- Load QA Pairs --------
qa_pairs = []
if os.path.exists(QA_CSV_PATH):
    with open(QA_CSV_PATH, encoding="utf-8") as f:
        reader = csv.DictReader(f)
        for row in reader:
            if "Question" in row and "Gold Answer" in row:
                qa_pairs.append((row["Question"].strip(), row["Gold Answer"].strip()))
else:
    print("QA CSV not found. Provide QA_CSV_PATH or create synthetic pairs manually.")

if MAX_Q:
    qa_pairs = qa_pairs[:MAX_Q]

print(f"Loaded {len(qa_pairs)} QA pairs.")
if not qa_pairs:
    raise SystemExit("No QA data available.")

assert 'rag' in globals(), "rag not found. Run Cell 1 first."

# -------- Evaluation (Concurrency-Limited) --------
from asyncio import Semaphore, create_task, as_completed

async def run_eval(mode, n, concurrency=EVAL_CONCURRENCY):
    qp = QueryParam(mode=mode, top_k=TOP_K)
    sem = Semaphore(concurrency)
    rows = []
    latencies = []

    async def one(i, question, gold):
        async with sem:
            t0 = time.perf_counter()
            try:
                answer = await rag.aquery(question, param=qp)
            except TypeError:
                answer = await rag.aquery(question)
            latency = time.perf_counter() - t0
            m = compute_metrics(answer, gold)
            return {
                "i": i,
                "question": question,
                "gold": gold,
                "answer": answer,
                "latency_s": latency,
                **m
            }

    tasks = [create_task(one(i, q, g)) for i, (q, g) in enumerate(qa_pairs, start=1)]
    pbar = tqdm(total=len(tasks), desc=f"Eval-{mode}", unit="q")

    for fut in as_completed(tasks):
        try:
            row = await fut
        except Exception as e:
            # Capture exceptions as a row with error info
            row = {"i": -1, "question": None, "gold": None, "answer": f"ERROR: {e}", "latency_s": 0.0}
        rows.append(row)
        if 'latency_s' in row:
            latencies.append(row.get("latency_s", 0.0))
        pbar.update(1)
    pbar.close()

    # Remove errored placeholder rows from metrics
    rows_ok = [r for r in rows if r.get('i', -1) != -1]
    if not rows_ok:
        print("All queries failed.")
        return rows

    rows_ok.sort(key=lambda r: r["i"])  # original order

    def _avg(key):
        vals = [r[key] for r in rows_ok if isinstance(r.get(key), (int, float))]
        return sum(vals)/len(vals) if vals else 0.0

    exact_rate = _avg('exact')
    substr_rate = _avg('substring')
    avg_token_recall = _avg('token_recall')
    avg_lat = sum(latencies)/len(latencies) if latencies else 0.0
    p95_lat = (sorted(latencies)[int(len(latencies)*0.95)-1] if len(latencies) > 1 else (latencies[0] if latencies else 0.0))

    print(f"\nAggregate: exact={exact_rate:.2%} substring={substr_rate:.2%} token_recall={avg_token_recall:.2%}")
    for mkey in ['jaccard','levenshtein','rouge1_f','rouge2_f','overlap','bleu','bert_cos']:
        if rows_ok and mkey in rows_ok[0]:
            print(f"  {mkey}: {_avg(mkey):.3f}")
    print(f"Latency: avg={avg_lat*1000:.1f} ms p95={p95_lat*1000:.1f} ms (concurrency={concurrency})")

    if OUTPUT_CSV_PATH and rows_ok:
        os.makedirs(OUTPUT_CSV_PATH, exist_ok=True)
        OUTPUT_CSV = os.path.join(OUTPUT_CSV_PATH, f"results_{mode}{n}.csv")
        write_header = not os.path.exists(OUTPUT_CSV)
        with open(OUTPUT_CSV, 'a', encoding='utf-8', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=list(rows_ok[0].keys()))
            if write_header:
                writer.writeheader()
            writer.writerows(rows_ok)
        print(f"Saved results to {OUTPUT_CSV}")
    return rows_ok

# Run evaluation (adjust EVAL_CONCURRENCY above as needed)
eval_results1 = await run_eval("light", 5)
eval_results2 = await run_eval("mini", 5)
eval_results3 = await run_eval("naive", 5)
eval_results4 = await run_eval("bypass", 5)
print("Evaluation complete.")

Loaded 637 QA pairs.


Eval-mini:   0%|          | 0/637 [00:00<?, ?q/s]

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cuda:0
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2


Q1: Did Adam Smith send a message to Li Hua about the upcoming building maintenance ...
Answer: Sorry, I'm not able to provide an answer to that question.
Gold: Yes
Metrics: {'exact': '0.000', 'substring': '0.000', 'token_recall': '0.000', 'jaccard': '0.000', 'levenshtein': '0.000', 'rouge1_f': '0.000', 'rouge2_f': '0.000', 'overlap': '0.000', 'bleu': '0.000', 'bert_cos': '0.251'} Latency: 2668.8 ms
-
Q2: Did Wolfgang ask Li Hua about watching "Star Wars: A New Hope" after he asked Li...
Answer: Sorry, I'm not able to provide an answer to that question.
Gold: Yes
Metrics: {'exact': '0.000', 'substring': '0.000', 'token_recall': '0.000', 'jaccard': '0.000', 'levenshtein': '0.000', 'rouge1_f': '0.000', 'rouge2_f': '0.000', 'overlap': '0.000', 'bleu': '0.000', 'bert_cos': '0.251'} Latency: 634.9 ms
-

Aggregate: exact=0.00% substring=33.33% token_recall=33.33%
  jaccard: 0.000
  levenshtein: 0.000
  rouge1_f: 0.000
  rouge2_f: 0.000
  overlap: 0.000
  bleu: 0.003
  bert_cos: 0.208
Latency