## Config

**If you are using Colab, run the install cell below before anything else.**

In [None]:
!pip install datasets transformers sentence-transformers rank-bm25 scikit-learn nltk evaluate bert-score tqdm rouge_score bertviz



In [None]:
import random, numpy as np, torch
import sys, subprocess
from dataclasses import dataclass
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer
import numpy as np
import evaluate
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, set_seed
import torch.nn.functional as F

In [None]:
@dataclass
class CFG:
    """Settings for the retrieval + generation experiment run by this notebook."""
    # Dataset identifier and configuration name handed to `load_dataset`.
    dataset_name: str = "hotpot_qa"
    dataset_config: str = "distractor"
    # Key of the split taken from the loaded dataset.
    split: str = "validation"
    # Number of shuffled rows kept from the split; also passed as the generation-eval limit.
    max_examples: int = 100
    # Which retriever index is built and queried.
    retriever: str = "dense"              # "tfidf" | "bm25" | "dense"
    # Default number of documents returned per query when no k is given.
    retriever_k: int = 5
    # Checkpoint name used for both the tokenizer and the seq2seq generator.
    generator_model: str = "google/flan-t5-small"
    # Prompt truncation length (tokenizer `max_length`).
    max_input_tokens: int = 1024
    # Generation budget passed to `generate` as `max_new_tokens`.
    max_new_tokens: int = 64
    # Evaluated once, when the class body runs: CUDA if available, else CPU.
    device: str = "cuda" if torch.cuda.is_available() else "cpu"
    # Seed applied to `random`, NumPy, torch and `set_seed`.
    seed: int = 42

# Instantiate the configuration, then seed every RNG the notebook relies on
# (Python, NumPy, torch CPU and, when present, all CUDA devices).
cfg = CFG()

random.seed(cfg.seed)
np.random.seed(cfg.seed)
torch.manual_seed(cfg.seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(cfg.seed)

cfg


CFG(dataset_name='hotpot_qa', dataset_config='distractor', split='validation', max_examples=100, retriever='dense', retriever_k=5, generator_model='google/flan-t5-small', max_input_tokens=1024, max_new_tokens=64, device='cuda', seed=42)

## Data — HotpotQA to corpus

In [None]:
# Load the configured dataset, then keep a handle on the split named in cfg.
ds = load_dataset(
    cfg.dataset_name,
    cfg.dataset_config,
)
split = ds[cfg.split]

def build_corpus_and_gold(ex):
    """Flatten one example's context paragraphs and collect its gold titles.

    Args:
        ex: mapping with ``ex["context"]["title"]`` and
            ``ex["context"]["sentences"]`` (parallel sequences; each
            ``sentences`` entry is a list of strings) and
            ``ex["supporting_facts"]["title"]``.

    Returns:
        ``(docs, titles, gold)``: ``docs[i]`` is ``"<title> :: <sentences
        joined by a space>"``, ``titles`` are the paragraph titles in the same
        order, and ``gold`` is the sorted list of distinct supporting-fact titles.
    """
    paragraphs = list(zip(ex["context"]["title"], ex["context"]["sentences"]))

    titles = [title for title, _ in paragraphs]
    docs = [f"{title} :: {' '.join(sents)}" for title, sents in paragraphs]

    # A title can support several facts; report each one once, in sorted order.
    gold = sorted(set(ex["supporting_facts"]["title"]))
    return docs, titles, gold

# Pick a random subset of the split: shuffle all row indices (using the global
# `random` state) and keep the first `cfg.max_examples`.
idxs = list(range(len(split)))
random.shuffle(idxs)
idxs = idxs[:cfg.max_examples]

# One record per sampled question, plus a pooled list of every context paragraph.
examples = []
corpus, titles_all = [], []
for i in idxs:
    ex = split[i]
    docs, titles, gold = build_corpus_and_gold(ex)
    examples.append({
        "id": ex["id"],
        "question": ex["question"],
        "answer": ex["answer"],
        "gold_titles": gold,
        "titles": titles,
        "contexts": docs,
    })
    corpus.extend(docs)
    titles_all.extend(titles)

# De-duplicate the pooled paragraphs by title, keeping the first occurrence, so
# that positions in `corpus_unique` and `title_unique` line up one-to-one.
seen = set()
corpus_unique, title_unique = [], []
for d, t in zip(corpus, titles_all):
    if t not in seen:
        seen.add(t)
        corpus_unique.append(d)
        title_unique.append(t)

len(examples), len(corpus_unique)


(100, 991)

## Retrievers

In [None]:
def tok(x):
    """Lower-case ``x`` and split it on whitespace into a list of tokens."""
    lowered = x.lower()
    return lowered.split()

# Handles for all three backends; only the one selected by cfg.retriever is
# populated, the others stay None.
tfidf_vec = X = None
bm25 = None
dense_model = dense_index = None

if cfg.retriever == "tfidf":
    # Unigram + bigram TF-IDF matrix over the de-duplicated corpus.
    tfidf_vec = TfidfVectorizer(max_features=100_000, ngram_range=(1,2))
    X = tfidf_vec.fit_transform(corpus_unique)
elif cfg.retriever == "bm25":
    # BM25 works on pre-tokenized documents.
    tokenized = [tok(x) for x in corpus_unique]
    bm25 = BM25Okapi(tokenized)
elif cfg.retriever == "dense":
    # Sentence-embedding matrix with one row per corpus document.
    dense_model = SentenceTransformer(
        "sentence-transformers/all-MiniLM-L6-v2",
        device=cfg.device,
    )
    dense_index = dense_model.encode(
        corpus_unique,
        batch_size=128,
        convert_to_numpy=True,
        show_progress_bar=True,
    )
else:
    raise ValueError("Unknown retriever")


Batches:   0%|          | 0/8 [00:00<?, ?it/s]

In [None]:
def retrieve(query, k=None):
    """Return the top-``k`` corpus documents for ``query``.

    The backend is chosen by ``cfg.retriever``: ``"tfidf"`` and ``"bm25"`` use
    their respective indexes, any other value falls through to the dense index.

    Args:
        query: question text.
        k: number of results; a falsy value (``None`` or ``0``) falls back to
            ``cfg.retriever_k``.

    Returns:
        List of ``(corpus_index, score)`` pairs as ``(int, float)``, ordered by
        descending score.
    """
    k = k or cfg.retriever_k

    # Each branch only has to produce one score per corpus document.
    if cfg.retriever == "tfidf":
        query_vec = tfidf_vec.transform([query])
        scores = (query_vec @ X.T).toarray().ravel()
    elif cfg.retriever == "bm25":
        scores = bm25.get_scores(tok(query))
    else:
        query_vec = dense_model.encode([query], convert_to_numpy=True)
        scores = (query_vec @ dense_index.T).ravel()

    # Negate so that argsort's ascending order yields highest scores first.
    top = np.argsort(-scores)[:k]
    return [(int(doc_id), float(scores[doc_id])) for doc_id in top]


## Generator

In [None]:
# Seed via transformers' helper, then load the tokenizer and the seq2seq
# generator named in cfg and move the model onto the configured device.
set_seed(cfg.seed)

gen_tok = AutoTokenizer.from_pretrained(cfg.generator_model)
gen = AutoModelForSeq2SeqLM.from_pretrained(cfg.generator_model).to(
    cfg.device
)

def build_prompt(question, contexts):
    """Build the generator prompt from a question and its retrieved passages.

    Args:
        question: the question text.
        contexts: iterable of passage strings, placed one per line under
            ``Context:``.

    Returns:
        The prompt string: instruction, context block, question, and a
        trailing ``Answer:`` cue, separated by real line breaks.
    """
    # Use real newlines: the previous "\\n" literals put a visible backslash
    # followed by "n" into the prompt instead of line breaks.
    ctx_str = "\n".join(contexts)
    return (
        "Answer the following question using *only* the context provided below.\n\n"
        f"Context:\n{ctx_str}\n\n"
        f"Question: {question}\nAnswer:"
    )

@torch.no_grad()
def generate_answer(question, ctx_docs):
    """Generate an answer and an average-NLL uncertainty score for it.

    Args:
        question: the question text.
        ctx_docs: list of context passages inserted into the prompt.

    Returns:
        ``(decoded_text, uncertainty_score)``. ``decoded_text`` is the
        generated sequence decoded with special tokens skipped.
        ``uncertainty_score`` is the mean negative log-probability of the
        generated tokens that precede the first pad/EOS token (``0.0`` when
        there are none); lower means the model was more confident.
    """
    inputs = gen_tok(build_prompt(question, ctx_docs), return_tensors="pt",
                     truncation=True, max_length=cfg.max_input_tokens).to(cfg.device)

    out = gen.generate(
        **inputs,
        max_new_tokens=cfg.max_new_tokens,
        output_scores=True,
        return_dict_in_generate=True
    )

    seq = out.sequences[0]
    scores = out.scores

    # For an encoder-decoder model the returned sequence begins with the
    # decoder start token, so scores[i] belongs to seq[i + 1]. Pairing them
    # with zip covers *every* generated token; the previous index loop stopped
    # one step early and dropped the last token whenever generation ended on
    # the max_new_tokens budget instead of on EOS.
    stop_ids = {gen_tok.pad_token_id, gen_tok.eos_token_id}
    generated_ids = seq[1:].tolist()

    nll = 0.0
    num_tokens = 0
    for step_logits, token_id in zip(scores, generated_ids):
        if token_id in stop_ids:
            break

        step_log_probs = F.log_softmax(step_logits, dim=-1)
        nll -= step_log_probs[0, token_id].item()
        num_tokens += 1

    # max(1, ...) keeps the score finite when nothing was generated before EOS.
    uncertainty_score = nll / max(1, num_tokens)

    decoded_text = gen_tok.decode(seq, skip_special_tokens=True)

    return decoded_text, uncertainty_score


## Retrieval evaluation: Precision@k / Recall@k

In [None]:
def precision_recall_at_k(example, retrieved, k=None):
    """Compute title-level Precision@k and Recall@k for one example.

    Args:
        example: record with a ``"gold_titles"`` list.
        retrieved: ranked ``(corpus_index, score)`` pairs; indexes refer to
            ``title_unique``.
        k: cut-off; a falsy value falls back to ``cfg.retriever_k``.

    Returns:
        ``(precision, recall)``: hits among the first ``k`` results divided by
        ``k`` and by the number of gold titles respectively (``0.0`` when the
        denominator is zero).
    """
    k = k or cfg.retriever_k
    gold = set(example["gold_titles"])

    hit = 0
    for doc_idx, _score in retrieved[:k]:
        if title_unique[doc_idx] in gold:
            hit += 1

    precision = hit / k if k > 0 else 0.0
    recall = hit / len(gold) if gold else 0.0
    return precision, recall

def eval_retrieval(examples, k=None):
    """Average Precision@k and Recall@k of the active retriever over ``examples``.

    Args:
        examples: iterable of records with ``"question"`` (and the fields
            ``precision_recall_at_k`` reads).
        k: cut-off; a falsy value falls back to ``cfg.retriever_k``.

    Returns:
        ``(mean_precision, mean_recall)`` as Python floats.
    """
    k = k or cfg.retriever_k

    per_example = [
        precision_recall_at_k(ex, retrieve(ex["question"], k=k), k=k)
        for ex in examples
    ]
    precisions = [p for p, _ in per_example]
    recalls = [r for _, r in per_example]
    return float(np.mean(precisions)), float(np.mean(recalls))

# Score the configured retriever on the sampled questions and print P@k / R@k.
Pk, Rk = eval_retrieval(examples, k=cfg.retriever_k)
print({
    "retriever": cfg.retriever,
    f"P@{cfg.retriever_k}": round(Pk,3),
    f"R@{cfg.retriever_k}": round(Rk,3),
})


{'retriever': 'dense', 'P@5': 0.308, 'R@5': 0.77}


## Generation evaluation: BLEU / ROUGE‑L / BERTScore

In [None]:
# Load the three text-generation metrics, in the same order as before.
bleu, rouge, bertscore = [
    evaluate.load(metric_name)
    for metric_name in ("bleu", "rouge", "bertscore")
]

def eval_generation(examples, k=None, limit=50):
    """Run retrieve-then-generate on ``examples`` and score the answers.

    Args:
        examples: iterable of records with ``"question"`` and ``"answer"``.
        k: number of passages retrieved per question; a falsy value falls back
            to ``cfg.retriever_k``.
        limit: evaluate at most this many examples from the front; ``None``
            evaluates all of them.

    Returns:
        ``(metrics, preds, refs)`` where ``metrics`` maps ``"BLEU"``,
        ``"ROUGE_L"``, ``"BERTScore_P"``, ``"BERTScore_R"`` and
        ``"BERTScore_F1"`` to plain Python floats, and ``preds`` / ``refs`` are
        the generated and reference answers in evaluation order.
    """
    k = k or cfg.retriever_k

    # Slice up front so the progress bar total matches the work actually done
    # and so that limit=None means "no limit".
    subset = list(examples)[:limit]

    preds, refs = [], []
    for ex in tqdm(subset):
        retrieved = retrieve(ex["question"], k=k)
        ctx_docs = [corpus_unique[idx] for idx, _ in retrieved]
        # The uncertainty score is not part of these metrics.
        pred, _ = generate_answer(ex["question"], ctx_docs)

        preds.append(pred)
        refs.append(ex["answer"])

    m_bleu = bleu.compute(predictions=preds, references=[[r] for r in refs])
    m_rouge = rouge.compute(predictions=preds, references=refs, use_stemmer=True)
    m_bert = bertscore.compute(predictions=preds, references=refs, lang="en")

    # Cast to built-in float so the dict prints and serializes cleanly instead
    # of carrying NumPy scalars such as np.float64(...).
    return {
        "BLEU": round(float(m_bleu["bleu"]), 4),
        "ROUGE_L": round(float(m_rouge["rougeL"]), 4),
        "BERTScore_P": float(np.mean(m_bert["precision"])),
        "BERTScore_R": float(np.mean(m_bert["recall"])),
        "BERTScore_F1": float(np.mean(m_bert["f1"])),
    }, preds, refs

# Evaluate generation on every sampled example and display the metric dict.
gen_metrics, gen_preds, gen_refs = eval_generation(
    examples,
    k=cfg.retriever_k,
    limit=cfg.max_examples,
)
gen_metrics


100%|██████████| 100/100 [00:30<00:00,  3.25it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'BLEU': 0.1304,
 'ROUGE_L': np.float64(0.3743),
 'BERTScore_P': 0.9074827939271927,
 'BERTScore_R': 0.9031340926885605,
 'BERTScore_F1': 0.9044328409433365}

## Joint summary

In [None]:
# One flat record: run settings and retrieval scores first, then the
# generation metrics appended in their own order.
summary = {
    "retriever": cfg.retriever,
    "generator": cfg.generator_model,
    "k": cfg.retriever_k,
    "Precision@k": round(Pk,3),
    "Recall@k": round(Rk,3),
}
summary.update(gen_metrics)
summary


{'retriever': 'dense',
 'generator': 'google/flan-t5-small',
 'k': 5,
 'Precision@k': 0.308,
 'Recall@k': 0.77,
 'BLEU': 0.1304,
 'ROUGE_L': np.float64(0.3743),
 'BERTScore_P': 0.9074827939271927,
 'BERTScore_R': 0.9031340926885605,
 'BERTScore_F1': 0.9044328409433365}

## Qualitative examples

In [None]:
def show_examples(n=5):
    """Print retrieval and generation details for the first ``n`` examples.

    For each example this prints the question, gold and retrieved titles, the
    predicted and reference answers, the average-NLL uncertainty, and a
    faithfulness label.

    The label is derived from the *predicted* answer: it is "Faithful" when the
    non-empty, case-folded prediction occurs verbatim in the concatenated
    retrieved passages, otherwise "Hallucinated". (Previously the gold answer
    was checked, which labelled a wrong prediction as faithful whenever the
    reference string happened to be in the context.)

    Args:
        n: number of examples to show; values <= 0 print nothing.
    """
    for ex in examples[:max(n, 0)]:
        retrieved = retrieve(ex["question"], k=cfg.retriever_k)
        ctx_titles = [title_unique[i] for i, _ in retrieved]
        ctx_docs = [corpus_unique[i] for i, _ in retrieved]

        pred, uncertainty = generate_answer(ex["question"], ctx_docs)

        pred_normalized = pred.strip().lower()
        context_normalized = " ".join(ctx_docs).lower()

        # An empty prediction is a substring of anything; do not count it as supported.
        faithful = bool(pred_normalized) and pred_normalized in context_normalized

        print("="*80)
        print("Q:", ex["question"])
        print("Gold titles:", ex["gold_titles"])
        print("Retrieved:", ctx_titles)
        print("Pred:", pred)
        print("Ref:", ex["answer"])

        print(f"Uncertainty (Avg NLL): {uncertainty:.4f} (Low = More Confident)")

        if faithful:
            print("Label:", "Faithful (Context contains the answer)")
        else:
            print("Label:", "Hallucinated (Answer NOT in context)")

# Print the first ten examples for manual inspection.
show_examples(n=10)

Q: What was Iqbal F. Qadir on when he participated in an attack on a radar station located on western shore of the Okhamandal Peninsula?
Gold titles: ['Dwarka', 'Iqbal F. Qadir']
Retrieved: ['Iqbal F. Qadir', 'Mukachevo Radar Station', 'Sevastopol Radar Station', 'Radar Station B-71', 'No. 227 Radar Station RAAF']
Pred: a flotilla
Ref: flotilla
Uncertainty (Avg NLL): 0.6118 (Low = More Confident)
Label: Faithful (Context contains the answer)
Q: When did the park at which Tivolis Koncertsal is located open?
Gold titles: ['Tivoli Gardens', 'Tivolis Koncertsal']
Retrieved: ['Tivolis Koncertsal', 'Tivoli Gardens', 'Tivoli Two', 'Tivoli One', 'Battle of Ticinus']
Pred: 15 August 1843
Ref: 15 August 1843
Uncertainty (Avg NLL): 0.0051 (Low = More Confident)
Label: Faithful (Context contains the answer)
Q: What is the shared country of ancestry between Art Laboe and Scout Tufankjian?
Gold titles: ['Art Laboe', 'Scout Tufankjian']
Retrieved: ['Scout Tufankjian', 'Art Laboe', 'Rajesh Roshan', 'I