In [1]:
from dotenv import load_dotenv
load_dotenv(".env")

from typing import List, Tuple

import torch
from datasets import Dataset, load_dataset
from langchain.vectorstores import FAISS
from langchain_community.vectorstores.utils import DistanceStrategy
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_huggingface import HuggingFaceEmbeddings
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import ContextRelevance, FactualCorrectness, Faithfulness
from ragatouille import RAGPretrainedModel
from tqdm import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
)

import sys
sys.path.append("self_reflection")

from CTRLEval.ctrleval import CTRLEval
from evaluate.loop_eval_utils import evaluate_knowledge, evaluate_response
from evaluate.sent_similarity import Sent_Similar

********************************************************************************
--------------------------------------------
RAGatouille version 0.0.10 will be migrating to a PyLate backend 
instead of the current Stanford ColBERT backend.
PyLate is a fully mature, feature-equivalent backend, that greatly facilitates compatibility.
However, please pin version <0.0.10 if you require the Stanford ColBERT backend.
********************************************************************************
  from ragatouille import RAGPretrainedModel


In [2]:
# Consistency scorer (CTRLEval) configured with the topic prompt / verbalizer files; runs on the GPU.
ctrleval_scorer = CTRLEval(
    iwf_dir="self_reflection/CTRLEval/iwf_full.txt",
    prompt_dir="self_reflection/CTRLEval/prompt/prompt_topic.txt",
    verbal_dir="self_reflection/CTRLEval/prompt/verbal_topic.txt",
    device="cuda",
)

# Expected warning while CTRLEval loads its Pegasus checkpoint (the recorded run reports google/pegasus-large):
#   "Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint ... and are
#    newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']"
# It can be ignored: Pegasus uses static, sinusoidal position embeddings (rather than learned embeddings)
# for both encoder and decoder, so there is nothing to load for those weights.

# Sentence-similarity scorer handed to `evaluate_response` as the entailment scorer.
entailment_scorer = Sent_Similar()

# Generator model, loaded in 4-bit NF4 with double quantization and bfloat16 compute.
MODEL_NAME = "Qwen/Qwen3-4B-Instruct-2507"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, quantization_config=bnb_config, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Text-generation pipeline around the same model/tokenizer.
# NOTE(review): `llm` is not referenced by the other cells visible in this notebook, which call
# `model.generate` directly through `generate_step` - confirm whether it is still needed.
llm = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    do_sample=True,
    temperature=0.2,
    return_full_text=False,
    max_new_tokens=500,
)

# Embedding model for the FAISS index; embeddings are L2-normalized at encode time.
embedding_model = HuggingFaceEmbeddings(
    model_name="Qwen/Qwen3-Embedding-0.6B",
    model_kwargs={"device": "cuda"},
    encode_kwargs={"normalize_embeddings": True},
)

# ColBERTv2 model used to rerank retrieved passages.
reranker = RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0")

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Device set to use cuda:0
  self.scaler = torch.cuda.amp.GradScaler()


In [3]:
# RAGBench / HotpotQA test split; each sample carries its own list of supporting documents.
ds = load_dataset("rungalileo/ragbench", "hotpotqa", split="test")

# Flatten every sample's documents into one corpus (duplicates across samples are still present here).
RAW_KNOWLEDGE_BASE = [doc for d in ds for doc in d["documents"]]

# De-duplicate while keeping first-seen order. `set` iteration order for strings changes between
# kernel restarts (hash randomization), which would shuffle the order in which texts are inserted
# into the index from one run to the next; `dict.fromkeys` removes the same duplicates deterministically.
KNOWLEDGE_VECTOR_DATABASE = FAISS.from_texts(
    list(dict.fromkeys(RAW_KNOWLEDGE_BASE)),
    embedding_model,
    distance_strategy=DistanceStrategy.COSINE,
)

In [4]:
class Args:
    """Settings shared by the self-reflection loops and by `generate_step`."""

    # Prompt ablations for the refine instruction: `no_aspect` asks for a plain refinement,
    # `no_number` keeps the aspect description but leaves the numeric score out.
    no_number = False
    no_aspect = False

    # Iteration budgets (outer loop, knowledge loop, response loop) and the demo count
    # forwarded to `evaluate_knowledge`.
    max_loop = 1
    max_knowledge_loop = 1
    max_response_loop = 1
    demo_num = 0

    # Scores below these thresholds trigger another refinement round.
    threshold_entailment = 0.8
    threshold_fact = -1
    threshold_consistency = -5

    # Decoding parameters read by `generate_step`.
    temperature = 1.0
    top_p = 1
    top_k = 1
    num_beams = 1
    max_new_tokens = 128
    repetition_penalty = 1.0


args = Args()
# Per-run overrides, stored on the instance so the class-level defaults stay untouched.
vars(args).update(
    max_loop=3,
    max_knowledge_loop=3,
    max_response_loop=3,
    demo_num=0,
    threshold_entailment=0.8,
    threshold_fact=-1.0,
    threshold_consistency=-5,
    max_new_tokens=512,
)

In [5]:
def generate_step(
    args: Args,
    model: AutoModelForCausalLM,
    tokenizer: AutoTokenizer,
    messages: List[dict[str, str]],
) -> str:
    """Run one chat completion and return only the newly generated text.

    Args:
        args: Decoding settings (`max_new_tokens`, `temperature`, `top_p`, `top_k`,
            `num_beams`, `repetition_penalty`).
        model: Causal LM used for generation.
        tokenizer: Tokenizer providing the chat template for `model`.
        messages: Chat turns as `{"role": ..., "content": ...}` dictionaries.

    Returns:
        The decoded continuation, with the prompt tokens and special tokens removed.
    """
    # The chat template already inserts the special tokens, so tokenization must not add them again.
    prompt_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    encoded = tokenizer(prompt_text, return_tensors="pt", add_special_tokens=False).to(model.device)

    generation_kwargs = {
        "max_new_tokens": args.max_new_tokens,
        # Sampling is switched on only for a positive temperature.
        "do_sample": args.temperature > 0,
        "temperature": args.temperature,
        "top_p": args.top_p,
        "top_k": args.top_k,
        "num_beams": args.num_beams,
        "repetition_penalty": args.repetition_penalty,
        # The EOS token doubles as the padding token.
        "pad_token_id": tokenizer.eos_token_id,
        "eos_token_id": tokenizer.eos_token_id,
    }
    generated = model.generate(**encoded, **generation_kwargs)

    # The output sequence starts with the prompt; keep only what comes after it.
    prompt_length = encoded["input_ids"].size(1)
    return tokenizer.decode(generated[0][prompt_length:], skip_special_tokens=True)


def knowledge_loop(
    args: Args,
    question: str,
    retrieved_docs: List[str],
) -> Tuple[str, List[Tuple[int, str, float]]]:
    """Draft background knowledge for `question` and refine it while it scores below the threshold.

    Uses the notebook-level `model` and `tokenizer` for generation and `evaluate_knowledge`
    for scoring.

    Args:
        args: Reads `max_knowledge_loop`, `threshold_fact`, `demo_num`, `no_aspect`, `no_number`.
        question: The user question.
        retrieved_docs: Retrieved passages; each one is rendered as a "- " bullet in the prompt.

    Returns:
        `(knowledge, history)`. `history` has one `[round, knowledge, score]` entry per scored
        draft and is empty when `max_knowledge_loop <= 1` (no scoring happens in that case).
        If the last draft is still below the threshold, the highest-scoring draft is returned,
        otherwise the last draft.
    """
    print("knowledge_loop")

    min_score = args.threshold_fact
    max_rounds = args.max_knowledge_loop

    def _score(text: str):
        # Score returned by the project's knowledge evaluator; higher counts as better here.
        return evaluate_knowledge(model, args.demo_num, question, text, tokenizer)

    def _refine_instruction(score) -> str:
        # Three prompt variants: no aspect at all, aspect without the number, aspect with the number.
        if args.no_aspect:
            return "Please refine the knowledge."
        if args.no_number:
            return "The knowledge is not strongly supported by empirical evidence. Please refine the knowledge to improve its factuality."
        return f"The factuality score for the knowledge is {score} less than {min_score}, which means the knowledge is not strongly supported by empirical evidence. Please refine the knowledge to improve its factuality."

    retrieved_context = "\n".join("- " + doc for doc in retrieved_docs)

    first_prompt = [
        {
            "role": "system",
            "content": (
                "You are an AI language model designed to provide accurate, relevant, and comprehensive background knowledge based on the given question and retrieved context, with the ability to incorporate additional context when necessary. "
                "Sometimes, the retrieved context may not be sufficient to answer the question accurately. In such cases, you should use your general knowledge to supplement the information from the retrieved context."
            ),
        },
        {
            "role": "user",
            "content": f'Provide background knowledge in one paragraph (approximately 400 words) to answer the question: "{question}", based on the following retrieved context:\n{retrieved_context}',
        },
    ]
    knowledge = generate_step(args, model, tokenizer, first_prompt)

    scored_drafts = []
    history = []

    # Single-pass mode: the draft is returned as is, without scoring it.
    if max_rounds <= 1:
        return knowledge, history

    score = _score(knowledge)
    scored_drafts.append([score, knowledge])
    history.append([0, knowledge, score])

    round_idx = 1
    while round_idx < max_rounds and score < min_score:
        instruction = _refine_instruction(score)
        refine_prompt = [
            {
                "role": "system",
                "content": "You are an AI language model designed to provide accurate, relevant, and comprehensive background knowledge based on the given question and retrieved context, with the ability to incorporate additional context when necessary.",
            },
            {
                "role": "user",
                "content": f'The provided background knowledge for the question: "{question}" is "{knowledge}", based on the following retrieved context:\n{retrieved_context}".\n\n{instruction}\nThe refined knowledge should be in one paragraph (approximately 400 words).',
            },
        ]
        knowledge = generate_step(args, model, tokenizer, refine_prompt)
        score = _score(knowledge)

        scored_drafts.append([score, knowledge])
        history.append([round_idx, knowledge, score])
        round_idx += 1

    # Budget exhausted without reaching the threshold: fall back to the best-scoring draft.
    if score < min_score:
        return sorted(scored_drafts)[-1][-1], history
    return knowledge, history


def response_loop(
    args: Args,
    question: str,
    final_knowledge: str,
) -> Tuple[str, List[Tuple[int, str, float, float]], float]:
    """Answer `question` from `final_knowledge` and refine the answer while its consistency is low.

    Uses the notebook-level `model` / `tokenizer` for generation and `evaluate_response`
    (with `entailment_scorer` and `ctrleval_scorer`) for scoring.

    The first draft is always scored, even when `max_response_loop <= 1`. The caller needs the
    entailment score in every case, and the previous version skipped the evaluation in
    single-pass mode and then failed with `UnboundLocalError` on return.

    Args:
        args: Reads `max_response_loop`, `threshold_consistency`, `no_aspect`, `no_number`.
        question: The user question.
        final_knowledge: Background knowledge the answer must be grounded in.

    Returns:
        `(response, history, entailment_score)`. `history` holds one
        `[round, response, entailment_score, consistency_score]` entry per scored draft.
        If the last draft's consistency score is still below the threshold, the draft with the
        highest mean of both scores is returned together with its entailment score; otherwise
        the last draft and its entailment score are returned.
    """
    print("response_loop")

    THRESHOLD_CONS = args.threshold_consistency
    MAX_RESPONSE_LOOP = args.max_response_loop

    candidates = []
    entailment_score_question_list = []
    history = []

    system_prompt = "You are an AI language model designed to provide accurate, relevant, and comprehensive answers to questions based on the given background knowledge."

    def _score_and_record(loop_i: int, response: str):
        # Scores one draft and stores it in all three bookkeeping lists (kept index-aligned).
        entailment, consistency = evaluate_response(entailment_scorer, ctrleval_scorer, question, response, final_knowledge)
        candidates.append([(entailment + consistency) / 2, response])
        entailment_score_question_list.append(entailment)
        history.append([loop_i, response, entailment, consistency])
        return entailment, consistency

    messages = [
        {"role": "system", "content": system_prompt},
        {
            "role": "user",
            "content": f'Refer to the background knowledge: "{final_knowledge}" and answer the question: "{question}" with one paragraph.',
        },
    ]
    response = generate_step(args, model, tokenizer, messages)
    entailment_score_question, cons_score_knowledge = _score_and_record(0, response)

    loop_i = 1
    while loop_i < MAX_RESPONSE_LOOP and cons_score_knowledge < THRESHOLD_CONS:
        # Three prompt variants: no aspect at all, aspect without the number, aspect with the number.
        if args.no_aspect:
            instruction = "Please refine the response."
        elif args.no_number:
            instruction = "The alignment and consistency between response and knowledge are low. Please refine the response to improve its consistency."
        else:
            instruction = f"The consistency score for the response is {cons_score_knowledge} less than {THRESHOLD_CONS}, which means the alignment and consistency between response and knowledge are low. Please refine the response to improve its consistency."

        messages = [
            {"role": "system", "content": system_prompt},
            {
                "role": "user",
                "content": f'The generated response for the question: "{question}" is "{response}" based on the background knowledge: "{final_knowledge}".\n\n{instruction}',
            },
        ]
        response = generate_step(args, model, tokenizer, messages)
        entailment_score_question, cons_score_knowledge = _score_and_record(loop_i, response)
        loop_i += 1

    if cons_score_knowledge < THRESHOLD_CONS:
        # Budget exhausted without reaching the threshold: return the draft with the best mean
        # score. With a single draft this is simply that draft, so single-pass mode is covered too.
        best_candidate, best_entailment = sorted(zip(candidates, entailment_score_question_list))[-1]
        return best_candidate[-1], history, best_entailment
    return response, history, entailment_score_question


def reflection_loop(
    args: Args,
    question: str,
    retrieved_docs: List[str],
) -> Tuple[str, str, List[Tuple[int, str, float]], List[Tuple[int, str, float, float]]]:
    """Run knowledge generation followed by answer generation, retrying while entailment is low.

    Each attempt calls `knowledge_loop` and then `response_loop`. At least one attempt always
    runs; further attempts run while fewer than `args.max_loop` have been made and the
    entailment score returned by `response_loop` is below `args.threshold_entailment`.

    Args:
        args: Reads `max_loop` and `threshold_entailment`; forwarded to both inner loops.
        question: The user question.
        retrieved_docs: Retrieved passages forwarded to `knowledge_loop`.

    Returns:
        `(knowledge, response, knowledge_history, response_history)`. The histories are the
        concatenation over all attempts. If the last attempt is still below the threshold
        (and more than one attempt was allowed), the attempt with the highest entailment
        score is returned; otherwise the last attempt.
    """
    min_entailment = args.threshold_entailment
    max_attempts = args.max_loop

    all_history_knowledge, all_history_response = [], []
    attempts = []

    attempt_idx = 0
    while True:
        print(f"main_loop {attempt_idx}")

        final_knowledge, knowledge_history = knowledge_loop(args, question, retrieved_docs)
        all_history_knowledge.extend(knowledge_history)

        final_response, response_history, entailment = response_loop(args, question, final_knowledge)
        all_history_response.extend(response_history)

        attempts.append([entailment, final_knowledge, final_response])
        attempt_idx += 1

        # Stop once the budget is used up or the answer is entailed well enough.
        if not (attempt_idx < max_attempts and entailment < min_entailment):
            break

    if max_attempts > 1 and entailment < min_entailment:
        # No attempt passed: fall back to the one with the highest entailment score.
        final_knowledge, final_response = sorted(attempts)[-1][1:]

    return final_knowledge, final_response, all_history_knowledge, all_history_response

In [6]:
def answer_with_rag_reflection(
    args: Args,
    question: str,
    knowledge_index: FAISS,
    use_reranker: bool = True,
    num_retrieved_docs: int = 30,
    num_docs_final: int = 7,
) -> Tuple[str, List[str]]:
    """Answer `question` with retrieval, optional reranking and the self-reflection loop.

    Args:
        args: Settings forwarded to `reflection_loop`.
        question: The user question.
        knowledge_index: Vector store queried with `similarity_search`.
        use_reranker: When true, the notebook-level `reranker` reorders the retrieved passages.
        num_retrieved_docs: Number of passages fetched from the index.
        num_docs_final: Number of passages kept for generation.

    Returns:
        `(final_response, relevant_docs)`: the generated answer and the passages it was given.
    """
    # Step 1: fetch candidate passages from the vector store.
    print("=> Retrieving documents...")
    hits = knowledge_index.similarity_search(query=question, k=num_retrieved_docs)
    relevant_docs = [hit.page_content for hit in hits]

    # Step 2: optionally rerank and keep the text of each reranked entry.
    if use_reranker:
        print("=> Reranking documents...")
        reranked = reranker.rerank(question, relevant_docs, k=num_docs_final)
        relevant_docs = [entry["content"] for entry in reranked]

    # Cap the list either way, so the no-reranker path is limited to the same size.
    relevant_docs = relevant_docs[:num_docs_final]

    # Step 3: generate the answer; only the final response of the reflection loop is used.
    print("=> Generating response with self-reflection...")
    _knowledge, final_response, _knowledge_history, _response_history = reflection_loop(args, question, relevant_docs)

    return final_response, relevant_docs

In [7]:
#response, relevant_docs = answer_with_rag_reflection(args, ds[0]['question'], KNOWLEDGE_VECTOR_DATABASE, use_reranker=True)

In [8]:
# Number of test questions to run through the pipeline; every question triggers several LLM calls.
num_samples = 2
dataset = []
for d in tqdm(ds.select(range(num_samples)), total=num_samples):
    question, reference = d["question"], d["response"]

    final_response, relevant_docs = answer_with_rag_reflection(
        args,
        question,
        KNOWLEDGE_VECTOR_DATABASE,
        use_reranker=True,
    )

    # One row per question: the generated answer next to the reference answer and the
    # adherence / relevance scores shipped with the dataset.
    dataset.append(
        dict(
            user_input=question,
            retrieved_contexts=relevant_docs,
            response=final_response,
            reference=reference,
            adherence_score=d["adherence_score"],
            relevance_score=d["relevance_score"],
        )
    )

  return torch.cuda.amp.autocast() if self.activated else NullContextManager()


=> Retrieving documents...
=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 18.70it/s]


=> Generating response with self-reflection...
main_loop 0
knowledge_loop
process:0/1
text:  Based on Question, please generate the factual knowledge. To do this, please consider these factors: Verifiability, Objectivity, and Reliability of Source. Note that this evaluation should be based on the best available medical knowledge.

Question: Which university did one of the key figures in the American documentary film, released in 2015, directed by Malcolm Ingram, pay for, before being drafted 18th overall pick for the New Jersey Nets?
Knowledge: The question refers to a key figure in the 2015 documentary *Out to Win*, directed by Malcolm Ingram, who attended a university before being drafted 18th overall by the New Jersey Nets. The documentary highlights LGBT participation in professional sports, featuring individuals such as Jason Collins. Among the key figures, Jason Paul Collins is directly relevant: he played college basketball at Stanford University, where he was an All-American du

100%|██████████| 2/2 [00:00<00:00, 10.00it/s]


main_loop 1
knowledge_loop
process:0/1
text:  Based on Question, please generate the factual knowledge. To do this, please consider these factors: Verifiability, Objectivity, and Reliability of Source. Note that this evaluation should be based on the best available medical knowledge.

Question: Which university did one of the key figures in the American documentary film, released in 2015, directed by Malcolm Ingram, pay for, before being drafted 18th overall pick for the New Jersey Nets?
Knowledge: The question refers to a key figure in the 2015 documentary *Out to Win*, directed by Malcolm Ingram, who attended a university before being drafted 18th overall by the New Jersey Nets. The documentary highlights LGBT participation in professional sports, featuring individuals such as Jason Collins. Among the key figures, Jason Paul Collins is directly relevant: he played college basketball at Stanford University, where he was an All-American during the 2000–01 season. After his college care

100%|██████████| 2/2 [00:00<00:00, 10.10it/s]


main_loop 2
knowledge_loop
process:0/1
text:  Based on Question, please generate the factual knowledge. To do this, please consider these factors: Verifiability, Objectivity, and Reliability of Source. Note that this evaluation should be based on the best available medical knowledge.

Question: Which university did one of the key figures in the American documentary film, released in 2015, directed by Malcolm Ingram, pay for, before being drafted 18th overall pick for the New Jersey Nets?
Knowledge: The question refers to a key figure in the 2015 documentary *Out to Win*, directed by Malcolm Ingram, who attended a university before being drafted 18th overall by the New Jersey Nets. The documentary highlights the history of LGBT participation in professional sports, featuring individuals such as Jason Collins. Among the key figures, Jason Paul Collins is directly relevant: he was an All-American basketball player at Stanford University before being drafted 18th overall in the 2001 NBA dr

100%|██████████| 2/2 [00:00<00:00, 10.80it/s]
 50%|█████     | 1/2 [15:35<15:35, 935.63s/it]

=> Retrieving documents...
=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 28.78it/s]


=> Generating response with self-reflection...
main_loop 0
knowledge_loop
process:0/1
text:  Based on Question, please generate the factual knowledge. To do this, please consider these factors: Verifiability, Objectivity, and Reliability of Source. Note that this evaluation should be based on the best available medical knowledge.

Question: Were both Léopold Eyharts and Ulrich Walter a General in the French Air Force? 
Knowledge: Léopold Eyharts and Ulrich Walter were not both generals in the French Air Force. Léopold Eyharts, born in 1957, is a Brigadier General in the French Air Force, a military engineer, and an astronaut with the European Space Agency (ESA). His military rank and affiliation are clearly documented within the French Air Force structure. In contrast, Ulrich Hans Walter, born in 1954, is a German physicist and engineer who served as an astronaut with the German Aerospace Center (DFVLR), not the French Air Force. He had no military affiliation with France, nor is there

100%|██████████| 2/2 [00:00<00:00, 12.31it/s]
100%|██████████| 2/2 [20:50<00:00, 625.49s/it]


In [9]:
# Judge model for the RAGAS metrics: Gemini 2.5 Flash at temperature 0, with no output-token or
# timeout limit and up to two retries, wrapped so that RAGAS can call it.
# NOTE(review): the API key is presumably picked up from the environment populated by
# `load_dotenv(".env")` - confirm the variable name expected by `ChatGoogleGenerativeAI`.
evaluator_llm = LangchainLLMWrapper(
    ChatGoogleGenerativeAI(
        model="gemini-2.5-flash",
        temperature=0,
        max_tokens=None,
        timeout=None,
        max_retries=2,
    )
)

E0000 00:00:1761277222.820471 2928735 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
  evaluator_llm = LangchainLLMWrapper(


In [10]:
# Turn the collected rows into a Hugging Face Dataset so they can be passed to RAGAS.
evaluation_dataset = Dataset.from_list(dataset)

# Score every row with three RAGAS metrics, using `evaluator_llm` as the judge.
ragas_result = evaluate(evaluation_dataset, metrics=[FactualCorrectness(), Faithfulness(), ContextRelevance()], llm=evaluator_llm)
print(ragas_result)

Evaluating:   0%|          | 0/6 [00:00<?, ?it/s]

E0000 00:00:1761277223.743246 2928735 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


{'factual_correctness(mode=f1)': 0.3100, 'faithfulness': 0.8684, 'nv_context_relevance': 0.6250}


In [11]:
# Per-sample view of the RAGAS result: one row per question, one column per metric.
ragas_result_df = ragas_result.to_pandas()
ragas_result_df.head()

Unnamed: 0,user_input,retrieved_contexts,response,reference,factual_correctness(mode=f1),faithfulness,nv_context_relevance
0,Which university did one of the key figures in...,"[Out to Win is an American documentary film, r...",None of the key figures in the 2015 documentar...,One of the key figures in the American documen...,0.0,0.736842,0.5
1,Were both Léopold Eyharts and Ulrich Walter a ...,"[Léopold Eyharts (born April 28, 1957) is a Br...","No, both Léopold Eyharts and Ulrich Walter wer...","No, only Léopold Eyharts was a General in the ...",0.62,1.0,0.75


In [12]:
import sys

sys.path.append("ragbench/ragbench")

from evaluation import calculate_metrics

In [13]:
# Attach the per-sample RAGAS scores under the column names passed to `calculate_metrics`.
# An already-present column is dropped first so the cell can be re-run on the same kernel:
# `evaluation_dataset` is rebound here, and adding a column name that already exists fails.
for _column, _scores in (
    ("faithfulness", ragas_result["faithfulness"]),
    ("context_relevance", ragas_result["nv_context_relevance"]),
):
    if _column in evaluation_dataset.column_names:
        evaluation_dataset = evaluation_dataset.remove_columns(_column)
    evaluation_dataset = evaluation_dataset.add_column(_column, _scores)
evaluation_dataset

Dataset({
    features: ['user_input', 'retrieved_contexts', 'response', 'reference', 'adherence_score', 'relevance_score', 'faithfulness', 'context_relevance'],
    num_rows: 2
})

In [14]:
# RAGBench metrics computed from the RAGAS predictions added to `evaluation_dataset`.
# NOTE(review): the trailing comments name the dataset columns these predictions are presumably
# compared against (`adherence_score`, `relevance_score`) - confirm in `ragbench/evaluation.py`.
metrics = calculate_metrics(
    evaluation_dataset,
    pred_adherence="faithfulness",  # adherence_score
    pred_context_relevance="context_relevance",  # relevance_score
)
metrics

{'hallucination_auroc': 1.0, 'relevance_rmse': np.float64(0.4615988003900188)}