# Evaluate DeReC Fact-Checking Verifier

DRAGON has no explicit "grounded/ungrounded" labels, so we create a synthetic evaluation dataset:
- **Grounded (0-99)**: answers via normal RAG pipeline (label=1)
- **Ungrounded (100-199)**: LLM answers WITHOUT context (label=0)

Then run the DeReC verifier and measure how well it distinguishes grounded answers from ungrounded ones.

## 1. Setup

In [None]:
# Clone the repository that provides the `rag_fact_checking.DRAGON.rag_bench` package imported later in this notebook.
!git clone https://github.com/BigMak1/rag_fact_checking.git

In [None]:
# Install the requirements file shipped in the cloned repository's DRAGON folder (-q keeps pip quiet).
!pip install -q -r rag_fact_checking/DRAGON/requirements.txt

In [None]:
import json
import random
import os
from collections import defaultdict

os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
os.environ["VLLM_LOGGING_LEVEL"] = "ERROR"

from datasets import load_dataset, Dataset, DatasetDict
from langchain_community.llms import VLLM
from langchain_huggingface import HuggingFaceEmbeddings
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt
from tqdm import tqdm

# from rag_bench import baseline, data, evaluator, results
from rag_fact_checking.DRAGON.rag_bench import baseline, data, evaluator, results

In [None]:
# ── Constants ──
# Number of questions answered with retrieved context (grounded) and
# without any context (ungrounded).
N_GROUNDED = 100
N_UNGROUNDED = 100

# Hugging Face dataset repositories holding the private QA pairs and texts.
HIST_PRIVATE_QA_REPO_ID: str = "ai-forever/hist-rag-bench-private-qa"
HIST_PRIVATE_TEXTS_REPO_ID: str = "ai-forever/hist-rag-bench-private-texts"
RANDOM_SEED: int = 42
EMBEDDER_NAME: str = "ai-forever/FRIDA"
LLM_NAME: str = "bond005/meno-tiny-0.1"

# Both values default to the environment variable of the same name so that a
# secret token never has to be typed into (and committed with) the notebook.
# When the variables are unset the values are empty strings, exactly as before;
# they can still be overridden by editing the right-hand side here.
DEREC_VERIFIER_REPO: str = os.environ.get("DEREC_VERIFIER_REPO", "")  # e.g. "evilfreelancer/ruRoBERTa-DeReC-v1"
HF_TOKEN: str = os.environ.get("HF_TOKEN", "")  # set if the verifier repo is private

# ── Prompts ──
# System prompt for the context-based run (passed as `system_prompt` when the
# generation chain is built). In Russian it tells the model to answer only from
# the supplied context, give no explanations, reply in Russian, and refuse to
# answer when the context does not contain the information.
LLM_PROMPT: str = """Проанализируйте заданный контекст и ответьте на вопрос пользователя на основе сведений, предоставленных в этом контексте.
Не давайте никаких объяснений и пояснений к своему ответу. Не пишите ничего лишнего. Не извиняйтесь, не стройте диалог. Выдавайте только ответ и ничего больше.
Отвечайте на русском языке.
Если в заданном контексте нет информации для ответа на вопрос пользователя, то ничего не придумывайте и просто откажитесь отвечать.
"""

# System prompt for the no-context run: answer the user's question, give no
# explanations, output only the answer, reply in Russian.
LLM_PROMPT_NO_CONTEXT: str = """Ответьте на вопрос пользователя.
Не давайте никаких объяснений. Выдавайте только ответ и ничего больше.
Отвечайте на русском языке.
"""

# ── Helpers ──
def _build_question_index(questions_ds):
    """Map str(question_id) -> {"question": ...}"""
    idx = {}
    for item in questions_ds["train"]:
        idx[str(item["id"])] = {"question": item["question"]}
    return idx


def _build_text_index(texts_ds):
    """Map doc_id -> text content"""
    idx = {}
    for item in texts_ds["train"]:
        idx[item["id"]] = item["text"]
    return idx


def get_private_qa_dataset(version):
    """Load the private QA dataset, pinned to the revision given by *version*."""
    qa_ds = load_dataset(HIST_PRIVATE_QA_REPO_ID, revision=version)
    return qa_ds


def get_private_texts_dataset(version):
    """Load the private texts dataset, pinned to the revision given by *version*."""
    texts = load_dataset(HIST_PRIVATE_TEXTS_REPO_ID, revision=version)
    return texts


def get_public_to_private_texts_mapping(version):
    """Return a ``public_id -> id`` dict built from the private texts dataset.

    The dataset is loaded for the given *version*; only its ``train`` split is
    read. If a ``public_id`` occurs more than once, the last row wins.
    """
    rows = get_private_texts_dataset(version)["train"]
    return {row["public_id"]: row["id"] for row in rows}

In [None]:
# Seed every random number generator used in this notebook with the same value:
# Python's `random`, NumPy, and PyTorch (CPU first, then CUDA when present).
for seed_fn in (random.seed, np.random.seed, torch.random.manual_seed):
    seed_fn(RANDOM_SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed(RANDOM_SEED)

## 2. Load Data & Models

In [None]:
# Load the texts and questions datasets plus the dataset version through the
# project's `data` helper (history variant, `is_hist=True`). The returned
# version is reused below to load the matching private datasets.
texts_ds, questions_ds, version = data.get_datasets(is_hist=True)
print(f"version = {version}")

In [None]:
# Private QA dataset and the public_id -> private id mapping for texts, both
# loaded at the same revision as the datasets above. They are passed to the
# evaluator in the metrics section.
qa_dataset = get_private_qa_dataset(version)
mapping = get_public_to_private_texts_mapping(version)

In [None]:
# Kaggle FIX
# Prepend the NVIDIA library directory to LIBRARY_PATH (keeping any existing
# value) and echo the result for debugging.
# NOTE(review): re-running this cell prepends the directory again each time.
os.environ["LIBRARY_PATH"] = "/usr/local/nvidia/lib64:" + os.environ.get("LIBRARY_PATH", "")
print("LIBRARY_PATH =", os.environ["LIBRARY_PATH"])

# Generator LLM served through vLLM via the LangChain wrapper.
# NOTE(review): tensor_parallel_size=2 presumably requires two visible GPUs — confirm
# before running on a different machine.
# NOTE(review): `disable_log_stats` is passed both inside `vllm_kwargs` and as a
# top-level argument — confirm whether the top-level one is needed/accepted.
llm = VLLM(
    model=LLM_NAME,
    tensor_parallel_size=2,
    max_new_tokens=256,
    top_p=0.95,
    temperature=0.3,
    vllm_kwargs={
        "gpu_memory_utilization": 0.45,
        "max_num_batched_tokens": 8192,
        "max_model_len": 4096,
        "disable_log_stats": True,
        "seed": RANDOM_SEED
    },
    disable_log_stats=True,
)
# Tokenizer of the same model; used below to render chat templates into prompts.
tok = AutoTokenizer.from_pretrained(LLM_NAME)

In [None]:
# Embedding model for retrieval. Documents and queries are encoded with
# different prompt prefixes ("search_document: " vs "search_query: ").
embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDER_NAME,
    model_kwargs={"trust_remote_code": True},
    encode_kwargs={"batch_size": 16, "prompt": "search_document: "},
    query_encode_kwargs={"prompt": "search_query: "}
)

# Retriever built by the project's baseline helper over `texts_ds`.
# NOTE(review): presumably texts are split into 500-unit chunks with 100 overlap
# and the top 5 matches are returned per query — confirm in `baseline.init_retriever`.
retrieval = baseline.init_retriever(
    texts_ds,
    embedding_model,
    top_k=5,
    chunk_size=500,
    chunk_overlap=100,
)
# Generation pipeline combining the retriever, the LLM and its tokenizer, using
# the context-based system prompt.
generation = baseline.init_generation(retrieval, llm, tok, system_prompt=LLM_PROMPT)

## 3. Grounded Answers (questions 0-99)

In [None]:
%%time
%%capture

# Produce the grounded group: run the generation pipeline through the project's
# baseline helper with `take=N_GROUNDED`.
# NOTE(review): presumably `take` selects the first N_GROUNDED questions — confirm
# in `baseline.get_results`; the ungrounded slice below starts at index N_GROUNDED.
# NOTE(review): `%%capture` presumably swallows this cell's output, including the
# print below — confirm whether the count is meant to be visible.
res_grounded = baseline.get_results(
    generation, questions_ds, take=N_GROUNDED, write_logs=False
)
print(f"Grounded answers: {len(res_grounded)}")

## 4. Ungrounded Answers (questions 100-199)

For each question:
1. Run retriever to get documents (needed for DeReC evidence)
2. Generate answer with LLM **without context** — purely from parametric knowledge

In [None]:
%%time

# Build no-context prompt template
messages_no_ctx = [
    {"role": "system", "content": LLM_PROMPT_NO_CONTEXT},
    {"role": "user", "content": "Вопрос: {question}"},
]
template_no_ctx = tok.apply_chat_template(
    messages_no_ctx, tokenize=False, add_generation_prompt=True
)

res_ungrounded = {}
ungrounded_slice = questions_ds["train"].select(range(N_GROUNDED, N_GROUNDED + N_UNGROUNDED))

for item in tqdm(ungrounded_slice, desc="Ungrounded"):
    # Retrieve documents (for DeReC evidence later)
    try:
        docs = retrieval.invoke(item["question"])
    except AttributeError:
        docs = retrieval.get_relevant_documents(item["question"])

    # Generate answer WITHOUT context
    prompt = template_no_ctx.replace("{question}", item["question"])
    answer = llm.invoke(prompt)

    res_ungrounded[item["id"]] = {
        "found_ids": [d.metadata["id"] for d in docs],
        "model_answer": answer,
    }

print(f"Ungrounded answers: {len(res_ungrounded)}")

## 5. Combine Results + Labels

In [None]:
# Merge both groups into a single result dict; if a key appeared in both, the
# ungrounded entry would replace the grounded one, which the size check catches.
res = dict(res_grounded)
res.update(res_ungrounded)
assert len(res) == N_GROUNDED + N_UNGROUNDED, f"Expected {N_GROUNDED + N_UNGROUNDED}, got {len(res)}"

# Ground-truth labels per question id: True = grounded, False = ungrounded.
labels = dict.fromkeys(res_grounded, True)
labels.update(dict.fromkeys(res_ungrounded, False))

# Persist the merged results with the project's helper and report group sizes.
results.save(res, "./fact_check_eval_results.json")
print(f"Saved {len(res)} results. Grounded: {sum(labels.values())}, Ungrounded: {sum(not v for v in labels.values())}")

## 6. DRAGON Metrics by Group

In [None]:
# Id sets of the two groups; reused by the scoring and plotting cells below.
grounded_ids = set(res_grounded.keys())
ungrounded_ids = set(res_ungrounded.keys())

# Filter private qa_dataset to the relevant public_ids for each group
# NOTE(review): the membership test compares `public_id` values with the result
# keys, so it assumes both have the same type (e.g. both int or both str) — confirm;
# the `n=` printed below shows how many rows actually matched.
qa_grounded = qa_dataset["train"].filter(lambda x: x["public_id"] in grounded_ids)
qa_ungrounded = qa_dataset["train"].filter(lambda x: x["public_id"] in ungrounded_ids)
# Each filtered split is wrapped in a DatasetDict with a "train" split before
# being passed to the evaluator.
qa_grounded_dd = DatasetDict({"train": qa_grounded})
qa_ungrounded_dd = DatasetDict({"train": qa_ungrounded})

print(f"=== GROUNDED (n={len(qa_grounded)}) ===")
eval_grounded = evaluator.evaluate_rag_results(res_grounded, qa_grounded_dd, mapping)
# Assigning to `_` keeps the notebook from echoing the return value.
_ = eval_grounded.to_table(overall_only=True)

print(f"\n=== UNGROUNDED (n={len(qa_ungrounded)}) ===")
# `eval_ungrounded` is reused by the ROUGE-L scatter plot at the end.
eval_ungrounded = evaluator.evaluate_rag_results(res_ungrounded, qa_ungrounded_dd, mapping)
_ = eval_ungrounded.to_table(overall_only=True)

## 7. DeReC Scoring

In [None]:
# Free GPU memory from LLM before loading verifier.
# `del` only unbinds the name, so run the garbage collector to destroy objects
# that are no longer referenced before asking PyTorch to release cached blocks.
# NOTE(review): `generation` was built from `llm` and may still hold a reference
# to it; if memory is not released, delete that object as well — confirm.
import gc

del llm
gc.collect()
torch.cuda.empty_cache()
print("LLM removed, GPU memory freed.")

In [None]:
# Load the DeReC verifier and score every (question, answer, evidence) triple.
# Fail early if the verifier repository was not configured.
assert DEREC_VERIFIER_REPO.strip(), "Set DEREC_VERIFIER_REPO in the constants cell first!"

# An empty/whitespace token becomes None, i.e. no token is passed.
hf_token = HF_TOKEN.strip() or None
verifier_tok = AutoTokenizer.from_pretrained(DEREC_VERIFIER_REPO, token=hf_token)
verifier = AutoModelForSequenceClassification.from_pretrained(DEREC_VERIFIER_REPO, token=hf_token)
verifier.eval()
device = "cuda" if torch.cuda.is_available() else "cpu"
verifier.to(device)

# Lookups: str(question id) -> {"question": ...} and document id -> text.
q_idx = _build_question_index(questions_ds)
t_idx = _build_text_index(texts_ds)

# Score every result
derec_scores = {}

with torch.no_grad():
    for qid, pred in tqdm(res.items(), desc="DeReC scoring"):
        # Missing questions, answers or documents fall back to empty strings.
        question = q_idx.get(str(qid), {}).get("question", "")
        answer = pred.get("model_answer", "")
        found_ids = pred.get("found_ids", [])
        # Evidence is the newline-joined text of every found id, capped at 4000 characters.
        # NOTE(review): `t_idx` maps ids to the texts stored in `texts_ds`, not to the
        # retrieved chunks — confirm that this is the intended evidence.
        evidence = "\n".join([t_idx.get(doc_id, "") for doc_id in found_ids])[:4000]

        text = f"claim: question: {question}\nanswer: {answer} [SEP] evidence: {evidence}"
        # The combined text is truncated/padded to exactly 512 tokens, so evidence
        # beyond that limit is not seen by the verifier.
        inp = verifier_tok(
            text, max_length=512, padding="max_length",
            truncation=True, return_tensors="pt"
        )
        inp = {k: v.to(device) for k, v in inp.items()}
        probs = torch.softmax(verifier(**inp).logits, dim=-1)[0].cpu().numpy()
        # With >= 3 classes the probability at index 2 is used as the score.
        # NOTE(review): presumably index 2 is the "true/supported" class — confirm
        # against the verifier's id2label. For models with fewer classes the max
        # probability is used, which does not identify a particular class.
        true_score = float(probs[2] if probs.shape[0] >= 3 else probs.max())
        derec_scores[qid] = true_score

# Statistics by group
scores_grounded = [derec_scores[qid] for qid in grounded_ids]
scores_ungrounded = [derec_scores[qid] for qid in ungrounded_ids]

print(f"\nGrounded   — mean: {np.mean(scores_grounded):.4f}, median: {np.median(scores_grounded):.4f}, std: {np.std(scores_grounded):.4f}")
print(f"Ungrounded — mean: {np.mean(scores_ungrounded):.4f}, median: {np.median(scores_ungrounded):.4f}, std: {np.std(scores_ungrounded):.4f}")

## 8. Classification Metrics

In [None]:
# Build label and score arrays aligned by qid (1 = grounded, 0 = ungrounded).
qids = sorted(res.keys())
y_true = np.array([1 if labels[qid] else 0 for qid in qids])
y_scores = np.array([derec_scores[qid] for qid in qids])

# ROC-AUC
auc = roc_auc_score(y_true, y_scores)
fpr, tpr, thresholds = roc_curve(y_true, y_scores)

# Optimal threshold (Youden's J): the ROC point that maximises TPR - FPR.
j_scores = tpr - fpr
best_idx = np.argmax(j_scores)
best_threshold = thresholds[best_idx]
best_tpr = tpr[best_idx]
best_fpr = fpr[best_idx]

print(f"ROC-AUC: {auc:.4f}")
print(f"Optimal threshold (Youden's J): {best_threshold:.4f}")
print(f"  TPR: {best_tpr:.4f}, FPR: {best_fpr:.4f}")

# Metrics at several thresholds
print("\n" + "="*65)
print(f"{'Threshold':>10} {'TPR':>8} {'FPR':>8} {'Precision':>10} {'F1':>8}")
print("="*65)
# Total number of negatives (fp + tn); it does not depend on the threshold.
n_neg = (y_true == 0).sum()
# Rows are printed in ascending threshold order, with the optimal one included.
for t in sorted([0.3, 0.4, 0.5, 0.6, 0.7, 0.8, best_threshold]):
    # An answer is predicted "grounded" when its score reaches the threshold.
    y_pred = (y_scores >= t).astype(int)
    tp = ((y_pred == 1) & (y_true == 1)).sum()
    fp = ((y_pred == 1) & (y_true == 0)).sum()
    fn = ((y_pred == 0) & (y_true == 1)).sum()
    tpr_t = tp / (tp + fn) if (tp + fn) > 0 else 0
    # FPR = fp / (fp + tn) = fp / (number of negatives). Dividing by
    # fp + n_neg would count the false positives twice and under-report FPR.
    fpr_t = fp / n_neg if n_neg > 0 else 0
    prec = tp / (tp + fp) if (tp + fp) > 0 else 0
    f1 = 2 * prec * tpr_t / (prec + tpr_t) if (prec + tpr_t) > 0 else 0
    marker = " *" if abs(t - best_threshold) < 1e-6 else ""
    print(f"{t:>10.4f} {tpr_t:>8.4f} {fpr_t:>8.4f} {prec:>10.4f} {f1:>8.4f}{marker}")
print("="*65)
print("* = optimal (Youden's J)")

## 9. Visualizations

In [None]:
# Overlaid histograms of the DeReC score for both groups, with the
# Youden-optimal threshold drawn as a dashed vertical line.
fig, ax = plt.subplots(figsize=(10, 5))
for group_scores, group_label, group_color in (
    (scores_grounded, "Grounded", "steelblue"),
    (scores_ungrounded, "Ungrounded", "tomato"),
):
    ax.hist(group_scores, bins=25, alpha=0.6, label=group_label, color=group_color)
ax.axvline(best_threshold, color="black", linestyle="--", label=f"Threshold={best_threshold:.3f}")
ax.set(
    xlabel="DeReC true_score",
    ylabel="Count",
    title="DeReC Score Distribution: Grounded vs Ungrounded",
)
ax.legend()
plt.tight_layout()
plt.show()

In [None]:
# ROC curve of the DeReC score, with the chance diagonal for reference and the
# Youden-optimal operating point highlighted in red.
fig, ax = plt.subplots(figsize=(7, 7))
ax.plot(fpr, tpr, color="steelblue", lw=2, label=f"ROC (AUC={auc:.3f})")
ax.plot([0, 1], [0, 1], color="gray", linestyle="--", lw=1, label="Random")
ax.scatter(
    [best_fpr],
    [best_tpr],
    color="red",
    s=100,
    zorder=5,
    label=f"Optimal (t={best_threshold:.3f})",
)
# Small margins around [0, 1] keep points on the border visible.
ax.set(
    xlabel="False Positive Rate",
    ylabel="True Positive Rate",
    title="ROC Curve — DeReC as Groundedness Detector",
    xlim=(-0.02, 1.02),
    ylim=(-0.02, 1.02),
)
ax.legend(loc="lower right")
plt.tight_layout()
plt.show()

In [None]:
# Scatter: ROUGE-L vs DeReC score for ungrounded answers.
# An ungrounded answer with high ROUGE-L matches the reference although the LLM
# saw no context, which is treated here as parametric knowledge; the verifier
# should still give such answers a low DeReC score.
rouge_l_ungrounded = []
derec_ungrounded = []

individual = eval_ungrounded.individual_results
for qid in ungrounded_ids:
    # Per-question results may be keyed by the stringified id or by the raw id;
    # try the string form first and skip questions that were not evaluated.
    key = next((k for k in (str(qid), qid) if k in individual), None)
    if key is None:
        continue
    rouge_l_ungrounded.append(individual[key]["generation"]["rougeL"])
    derec_ungrounded.append(derec_scores[qid])

fig, ax = plt.subplots(figsize=(9, 6))
ax.scatter(rouge_l_ungrounded, derec_ungrounded, alpha=0.6, c="tomato", edgecolors="gray", s=40)
ax.axhline(best_threshold, color="black", linestyle="--", alpha=0.7, label=f"DeReC threshold={best_threshold:.3f}")
ax.set_xlabel("ROUGE-L (ungrounded vs reference)")
ax.set_ylabel("DeReC true_score")
ax.set_title("Ungrounded: ROUGE-L vs DeReC Score\n(top-right = parametric knowledge)")
ax.legend()
plt.tight_layout()
plt.show()

# Count parametric knowledge cases
n_high_rouge = sum(1 for r in rouge_l_ungrounded if r > 0.5)
# Use `>=` so the count agrees with the decision rule of the classification
# metrics, where a score equal to the threshold is predicted as grounded.
n_high_rouge_high_derec = sum(
    1 for r, d in zip(rouge_l_ungrounded, derec_ungrounded) if r > 0.5 and d >= best_threshold
)
print(f"Ungrounded with ROUGE-L > 0.5 (parametric knowledge): {n_high_rouge}/{len(rouge_l_ungrounded)}")
print(f"  of which DeReC >= threshold (false positives): {n_high_rouge_high_derec}")