In [1]:
from dotenv import load_dotenv
load_dotenv(".env")

from typing import List, Tuple

import torch
from datasets import Dataset, load_dataset
from langchain.vectorstores import FAISS
from langchain_community.vectorstores.utils import DistanceStrategy
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_huggingface import HuggingFaceEmbeddings
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import ContextRelevance, FactualCorrectness, Faithfulness
from ragatouille import RAGPretrainedModel
from tqdm import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
)

import sys
sys.path.append("self_reflection")

from CTRLEval.ctrleval import CTRLEval
from evaluate.loop_eval_utils import evaluate_response
from evaluate.sent_similarity import Sent_Similar

********************************************************************************
--------------------------------------------
RAGatouille version 0.0.10 will be migrating to a PyLate backend 
instead of the current Stanford ColBERT backend.
PyLate is a fully mature, feature-equivalent backend, that greatly facilitates compatibility.
However, please pin version <0.0.10 if you require the Stanford ColBERT backend.
********************************************************************************
  from ragatouille import RAGPretrainedModel


In [2]:
# Scorers passed to `evaluate_response` inside `response_loop`.
# NOTE(review): both the scorer and the models below are pinned to "cuda"; this cell needs a GPU.
ctrleval_scorer = CTRLEval(
    iwf_dir="self_reflection/CTRLEval/iwf_full.txt",
    prompt_dir="self_reflection/CTRLEval/prompt/prompt_topic.txt",
    verbal_dir="self_reflection/CTRLEval/prompt/verbal_topic.txt",
    device="cuda",
)

# Expected warning (not an error) when the scorer loads: "Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']"
# It is ignored because Pegasus uses static, sinusoidal position embeddings (rather than learned embeddings) for both encoder and decoder.

entailment_scorer = Sent_Similar()

# Generator: Qwen3-4B-Instruct loaded with 4-bit NF4 quantization (double quantization, bfloat16 compute).
MODEL_NAME = "Qwen/Qwen3-4B-Instruct-2507"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, quantization_config=bnb_config, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# NOTE(review): `llm` is not referenced by any later cell in this notebook; generation goes through
# `generate_step`, which calls `model.generate` directly with the settings held in `Args`.
llm = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    do_sample=True,
    temperature=0.2,
    return_full_text=False,
    max_new_tokens=500,
)

# Embedding model for the FAISS index; embeddings are L2-normalized, matching the cosine
# distance strategy used when the index is built.
embedding_model = HuggingFaceEmbeddings(
    model_name="Qwen/Qwen3-Embedding-0.6B",
    model_kwargs={"device": "cuda"},
    encode_kwargs={"normalize_embeddings": True},
)

# ColBERT reranker applied to the retrieved passages in `answer_with_rag_reflection`.
reranker = RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0")

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Device set to use cuda:0
  self.scaler = torch.cuda.amp.GradScaler()


In [3]:
# RAGBench / HotpotQA test split: the source of both the questions and the document corpus.
ds = load_dataset("rungalileo/ragbench", "hotpotqa", split="test")

# Every document attached to every test sample, duplicates included.
RAW_KNOWLEDGE_BASE = [doc for d in ds for doc in d["documents"]]

# De-duplicate while preserving first-seen order. `list(set(...))` would order the texts by string
# hash, which is randomized per interpreter session, so the index would be populated in a different
# order on every kernel restart. `dict.fromkeys` keeps the same unique texts in a reproducible order.
KNOWLEDGE_VECTOR_DATABASE = FAISS.from_texts(
    list(dict.fromkeys(RAW_KNOWLEDGE_BASE)),
    embedding_model,
    distance_strategy=DistanceStrategy.COSINE,
)

In [4]:
class Args:
    """Default settings for the self-reflection loops and for text generation.

    All values are plain class attributes; an instance can override any of them by assignment.
    """

    # --- Wording of the refinement instruction built in `response_loop` ---
    # True: send only "Please refine the response." (takes precedence over `no_number`).
    no_aspect: bool = False
    # True: mention low consistency but leave the numeric score and threshold out of the instruction.
    no_number: bool = False

    # --- Loop budgets ---
    # Maximum number of full answer attempts made by `reflection_loop`.
    max_loop: int = 1
    # Maximum number of responses (first draft plus refinements) produced per `response_loop` call.
    max_response_loop: int = 1
    # Not read by any function defined in this notebook.
    demo_num: int = 0

    # --- Acceptance thresholds ---
    # `reflection_loop` retries while the entailment score is below this value.
    threshold_entailment: float = 0.8
    # `response_loop` refines while the consistency score is below this value.
    threshold_consistency: float = -5

    # --- Generation settings forwarded to `model.generate` by `generate_step` ---
    max_new_tokens: int = 128
    # Sampling is enabled in `generate_step` whenever temperature > 0.
    temperature: float = 1.0
    top_p: float = 1
    # NOTE(review): top_k=1 presumably restricts sampling to the single most likely token,
    # which would make generation effectively greedy despite sampling being on; confirm this is intended.
    top_k: int = 1
    num_beams: int = 1
    repetition_penalty: float = 1.0


# Run configuration for this experiment: start from the `Args` defaults and set the values below
# as instance attributes (several of them repeat the class defaults and are listed for visibility).
args = Args()
vars(args).update(
    max_loop=3,
    max_response_loop=3,
    demo_num=0,
    threshold_entailment=0.8,
    threshold_consistency=-5,
    max_new_tokens=256,
)

In [5]:
def generate_step(
    args: Args,
    model: AutoModelForCausalLM,
    tokenizer: AutoTokenizer,
    messages: List[dict[str, str]],
) -> str:
    """Run one chat completion and return only the newly generated text.

    Args:
        args: Generation settings (``max_new_tokens``, ``temperature``, ``top_p``, ``top_k``,
            ``num_beams``, ``repetition_penalty``).
        model: Model whose ``generate`` method produces the completion.
        tokenizer: Tokenizer used for the chat template, encoding and decoding.
        messages: Chat messages passed to the tokenizer's chat template.

    Returns:
        The decoded completion with special tokens removed; the prompt is not included.
    """
    # Render the conversation to text first; the template output is then tokenized
    # with add_special_tokens=False.
    prompt_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    encoded_prompt = tokenizer(prompt_text, return_tensors="pt", add_special_tokens=False).to(model.device)
    prompt_length = encoded_prompt["input_ids"].size(1)

    generation_kwargs = dict(
        max_new_tokens=args.max_new_tokens,
        # Sample only for a positive temperature; otherwise sampling is switched off.
        do_sample=args.temperature > 0,
        temperature=args.temperature,
        top_p=args.top_p,
        top_k=args.top_k,
        num_beams=args.num_beams,
        repetition_penalty=args.repetition_penalty,
        # The EOS token doubles as the padding token.
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
    generated = model.generate(**encoded_prompt, **generation_kwargs)

    # The output sequence starts with the prompt tokens; keep only what follows them.
    completion_ids = generated[0][prompt_length:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True)


def response_loop(
    args: Args,
    question: str,
    final_knowledge: str,
) -> Tuple[str, List[list], float]:
    """Generate an answer from the context and refine it while its consistency score is too low.

    The first response is always scored with ``evaluate_response``. While the consistency score is
    below ``args.threshold_consistency`` and fewer than ``args.max_response_loop`` responses have
    been produced, the model is asked to refine its previous response and the result is scored again.

    Previously the first response was scored only when ``args.max_response_loop > 1``; with a budget
    of 1 (the ``Args`` class default) the function failed on return because the entailment score had
    never been assigned. Scoring unconditionally fixes that and leaves larger budgets unchanged.

    Args:
        args: Loop and generation settings (``threshold_consistency``, ``max_response_loop``,
            ``no_aspect``, ``no_number``, plus the fields read by ``generate_step``).
        question: The user question.
        final_knowledge: Background context inserted into the prompt.

    Returns:
        A tuple ``(response, history, entailment_score)`` where ``history`` holds one
        ``[loop_index, response, entailment_score, consistency_score]`` record per scored response.
        If the last response still scores below the consistency threshold, the response with the
        highest mean of entailment and consistency scores is returned together with its entailment
        score; otherwise the last response is returned.
    """
    print("response_loop")

    threshold_cons = args.threshold_consistency
    max_response_loop = args.max_response_loop

    system_message = {
        "role": "system",
        "content": "You are an AI language model designed to provide accurate, relevant, and comprehensive answers to questions based on the given background context.",
    }

    scored_candidates = []  # (mean of the two scores, response, entailment score)
    history = []

    def _score(loop_index, candidate):
        # Score one response and record it in both `scored_candidates` and `history`.
        entailment, consistency = evaluate_response(entailment_scorer, ctrleval_scorer, question, candidate, final_knowledge)
        scored_candidates.append(((entailment + consistency) / 2, candidate, entailment))
        history.append([loop_index, candidate, entailment, consistency])
        return entailment, consistency

    messages = [
        system_message,
        {
            "role": "user",
            "content": f"Answer the question: \"{question}\" with one paragraph by referring to the background context: \n{final_knowledge}",
        },
    ]
    response = generate_step(args, model, tokenizer, messages)
    entailment_score_question, cons_score_knowledge = _score(0, response)

    loop_i = 1
    while loop_i < max_response_loop and cons_score_knowledge < threshold_cons:
        # Pick the feedback wording: bare request, qualitative feedback, or feedback with the score.
        if args.no_aspect:
            instruction = "Please refine the response."
        elif args.no_number:
            instruction = "The alignment and consistency between the response and the context are low. Please refine the response to improve its consistency."
        else:
            instruction = f"The consistency score for the response is {cons_score_knowledge} less than {threshold_cons}, which means the alignment and consistency between the response and the context are low. Please refine the response to improve its consistency."

        messages = [
            system_message,
            {
                "role": "user",
                "content": f"The generated response for the question: \"{question}\" is \"{response}\" based on the background context: \n{final_knowledge}.\n\n{instruction}",
            },
        ]
        response = generate_step(args, model, tokenizer, messages)
        entailment_score_question, cons_score_knowledge = _score(loop_i, response)
        loop_i += 1

    if cons_score_knowledge < threshold_cons:
        # The budget ran out without reaching the threshold: fall back to the best-scoring attempt
        # (ties on the mean score are broken by response text, then by entailment score).
        _, best_response, best_entailment = sorted(scored_candidates)[-1]
        return best_response, history, best_entailment

    return response, history, entailment_score_question


def reflection_loop(
    args: Args,
    question: str,
    retrieved_docs: List[str],
) -> Tuple[str, List[list]]:
    """Answer ``question`` from ``retrieved_docs``, retrying while the entailment score is too low.

    Each attempt runs ``response_loop`` on the same bulleted context. Attempts continue until the
    entailment score reaches ``args.threshold_entailment`` or ``args.max_loop`` attempts have been
    made. The return annotation now matches what the function has always returned (two values).

    NOTE(review): every attempt reuses the same question and context, so attempts can only differ
    through sampling randomness in generation; with the ``top_k = 1`` default in ``Args`` the retries
    are presumably near-identical. Confirm whether the retry is meant to vary its input.

    Args:
        args: Loop settings (``threshold_entailment``, ``max_loop``) plus everything
            ``response_loop`` reads.
        question: The user question.
        retrieved_docs: Passages used as background context.

    Returns:
        A tuple ``(final_response, history)`` where ``history`` is the concatenation of the
        histories returned by every ``response_loop`` call. If the last attempt is still below the
        entailment threshold and more than one attempt was allowed, ``final_response`` is the
        attempt with the highest entailment score.
    """
    threshold_entail = args.threshold_entailment
    max_loop = args.max_loop

    # Present the passages to the model as a bulleted list, one passage per line.
    retrieved_context = "\n".join(["- " + doc for doc in retrieved_docs])

    all_history_response = []
    candidates = []  # (entailment score, response)
    main_loop_i = 0
    while True:
        print(f"main_loop {main_loop_i}")

        final_response, history_response, entailment_score_question = response_loop(args, question, retrieved_context)
        all_history_response += history_response
        candidates.append((entailment_score_question, final_response))
        main_loop_i += 1

        # Stop once the budget is spent or the answer is entailed strongly enough.
        if not (main_loop_i < max_loop and entailment_score_question < threshold_entail):
            break

    if max_loop > 1 and entailment_score_question < threshold_entail:
        # No attempt reached the threshold: return the one with the highest entailment score.
        final_response = sorted(candidates)[-1][1]

    return final_response, all_history_response

In [8]:
def answer_with_rag_reflection(
    args: Args,
    question: str,
    knowledge_index: FAISS,
    use_reranker: bool = True,
    num_retrieved_docs: int = 30,
    num_docs_final: int = 7,
) -> Tuple[str, List[str]]:
    """Retrieve passages for ``question``, optionally rerank them, and answer with self-reflection.

    Args:
        args: Settings forwarded to ``reflection_loop``.
        question: The user question.
        knowledge_index: Vector index queried with ``similarity_search``.
        use_reranker: Whether to rerank the retrieved passages with the notebook-level ``reranker``.
        num_retrieved_docs: Number of passages fetched from the index.
        num_docs_final: Number of passages kept as context for generation.

    Returns:
        A tuple ``(final_response, passages)`` where ``passages`` are the context texts that were
        given to the generator.
    """
    # Step 1: fetch candidate passages from the vector index.
    print("=> Retrieving documents...")
    hits = knowledge_index.similarity_search(query=question, k=num_retrieved_docs)
    passages = [hit.page_content for hit in hits]

    # Step 2: optionally let the reranker choose and order the passages.
    if use_reranker:
        print("=> Reranking documents...")
        reranked = reranker.rerank(question, passages, k=num_docs_final)
        passages = [entry["content"] for entry in reranked]

    # Cap the context size whether or not reranking ran.
    context_passages = passages[:num_docs_final]

    # Step 3: generate the answer; the per-step history is not needed by callers here.
    print("=> Generating response with self-reflection...")
    final_response, _history = reflection_loop(args, question, context_passages)

    return final_response, context_passages

In [9]:
# Number of test questions to run through the pipeline (a small smoke-test subset).
num_samples = 2

# One record per question: the fields RAGAS reads, plus the RAGBench reference scores
# that are compared against the predicted scores in the final cells.
dataset = []
for sample in tqdm(ds.select(range(num_samples)), total=num_samples):
    final_response, relevant_docs = answer_with_rag_reflection(
        args,
        sample["question"],
        KNOWLEDGE_VECTOR_DATABASE,
        use_reranker=True,
    )

    record = {
        "user_input": sample["question"],
        "retrieved_contexts": relevant_docs,
        "response": final_response,
        "reference": sample["response"],
        "adherence_score": sample["adherence_score"],
        "relevance_score": sample["relevance_score"],
    }
    dataset.append(record)

  0%|          | 0/2 [00:00<?, ?it/s]

=> Retrieving documents...
=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 19.17it/s]


=> Generating response with self-reflection...
main_loop 0
response_loop


100%|██████████| 2/2 [00:00<00:00,  6.04it/s]
100%|██████████| 2/2 [00:00<00:00,  5.96it/s]
100%|██████████| 2/2 [00:00<00:00,  5.90it/s]


main_loop 1
response_loop


100%|██████████| 2/2 [00:00<00:00,  6.11it/s]
100%|██████████| 2/2 [00:00<00:00,  5.91it/s]
100%|██████████| 2/2 [00:00<00:00,  5.84it/s]


main_loop 2
response_loop


100%|██████████| 2/2 [00:00<00:00,  6.27it/s]
100%|██████████| 2/2 [00:00<00:00,  5.94it/s]
100%|██████████| 2/2 [00:00<00:00,  5.88it/s]
 50%|█████     | 1/2 [09:00<09:00, 540.35s/it]

=> Retrieving documents...
=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 26.98it/s]


=> Generating response with self-reflection...
main_loop 0
response_loop


100%|██████████| 2/2 [00:00<00:00,  8.96it/s]
100%|██████████| 2/2 [09:37<00:00, 288.73s/it]


In [10]:
# Judge model for the RAGAS metrics: Gemini 2.5 Flash at temperature 0, wrapped for RAGAS.
# NOTE(review): API credentials are presumably supplied by the `.env` file loaded in the first cell; confirm.
gemini_judge = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    temperature=0,
    max_retries=2,
    max_tokens=None,
    timeout=None,
)
evaluator_llm = LangchainLLMWrapper(gemini_judge)

E0000 00:00:1761240054.931996 2647483 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
  evaluator_llm = LangchainLLMWrapper(


In [11]:
# Convert the collected records into a `Dataset` and score them with RAGAS using the judge LLM.
evaluation_dataset = Dataset.from_list(dataset)

ragas_metrics = [FactualCorrectness(), Faithfulness(), ContextRelevance()]
ragas_result = evaluate(
    evaluation_dataset,
    metrics=ragas_metrics,
    llm=evaluator_llm,
)
print(ragas_result)

Evaluating:   0%|          | 0/6 [00:00<?, ?it/s]

E0000 00:00:1761240058.262086 2647483 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


{'factual_correctness(mode=f1)': 0.4300, 'faithfulness': 0.8295, 'nv_context_relevance': 0.6250}


In [12]:
# Per-sample RAGAS scores as a DataFrame; preview the first rows.
ragas_result_df = ragas_result.to_pandas()
ragas_result_df.head(n=5)

Unnamed: 0,user_input,retrieved_contexts,response,reference,factual_correctness(mode=f1),faithfulness,nv_context_relevance
0,Which university did one of the key figures in...,"[Out to Win is an American documentary film, r...","Certainly. Below is a **refined, consistent, a...",One of the key figures in the American documen...,0.0,0.75,0.5
1,Were both Léopold Eyharts and Ulrich Walter a ...,"[Léopold Eyharts (born April 28, 1957) is a Br...","No, both Léopold Eyharts and Ulrich Walter wer...","No, only Léopold Eyharts was a General in the ...",0.86,0.909091,0.75


In [13]:
import sys
sys.path.append("ragbench/ragbench")

from evaluation import calculate_metrics

In [14]:
# Attach the predicted scores to the samples. The dataset is rebuilt from `dataset` instead of
# extending `evaluation_dataset` in place, so re-running this cell does not try to add columns
# that are already present.
evaluation_dataset = (
    Dataset.from_list(dataset)
    .add_column("faithfulness", ragas_result["faithfulness"])
    .add_column("context_relevance", ragas_result["nv_context_relevance"])
)
evaluation_dataset

Dataset({
    features: ['user_input', 'retrieved_contexts', 'response', 'reference', 'adherence_score', 'relevance_score', 'faithfulness', 'context_relevance'],
    num_rows: 2
})

In [15]:
# Columns holding the predicted scores handed to the RAGBench metric helper.
# NOTE(review): presumably compared against the `adherence_score` / `relevance_score` columns; confirm in `calculate_metrics`.
prediction_columns = {
    "pred_adherence": "faithfulness",
    "pred_context_relevance": "context_relevance",
}
metrics = calculate_metrics(evaluation_dataset, **prediction_columns)
metrics

{'hallucination_auroc': 1.0, 'relevance_rmse': np.float64(0.4615988003900188)}