In [None]:
from dotenv import load_dotenv
load_dotenv(".env")

import os
import json
from typing import List, Tuple, Union

import torch
from datasets import Dataset, load_dataset
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores.utils import DistanceStrategy
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_huggingface import HuggingFaceEmbeddings
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import ContextRelevance, FactualCorrectness, Faithfulness
from ragatouille import RAGPretrainedModel
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer

import sys
sys.path.append("self_reflection")

from CTRLEval.ctrleval import CTRLEval
from evaluate.loop_eval_utils import evaluate_knowledge, evaluate_response
from evaluate.sent_similarity import Sent_Similar

In [None]:
# --- Scorers consumed by the self-reflection loops ---------------------------
# Note: loading CTRLEval prints
#   "Some weights of PegasusForConditionalGeneration were not initialized from the model
#    checkpoint at google/pegasus-xsum and are newly initialized:
#    ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']"
# This can be ignored: Pegasus uses static, sinusoidal position embeddings (rather than
# learned embeddings) for both encoder and decoder.
ctrleval_scorer = CTRLEval(
    device="cuda",
    iwf_dir="self_reflection/CTRLEval/iwf_full.txt",
    prompt_dir="self_reflection/CTRLEval/prompt/prompt_topic.txt",
    verbal_dir="self_reflection/CTRLEval/prompt/verbal_topic.txt",
)
entailment_scorer = Sent_Similar()

# --- Generator LLM (inference only, bf16, on the GPU) -------------------------
MODEL_NAME = "Qwen/Qwen3-4B-Instruct-2507"
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="cuda",
    dtype=torch.bfloat16,
).eval()
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# --- Retrieval stack: dense embeddings + ColBERT reranker ---------------------
EMBEDDING_MODEL_NAME = "Qwen/Qwen3-Embedding-0.6B"
embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    encode_kwargs={"normalize_embeddings": True},
    model_kwargs={"device": "cuda"},
)
reranker = RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0")

In [None]:
# Choose exactly one RAGBench subset; the name is reused below for cache and result paths.
ds_name = "hotpotqa"
#ds_name = "pubmedqa"
#ds_name = "delucionqa"
ds = load_dataset("rungalileo/ragbench", ds_name)

# Report the size of each split, one number per line.
for split_name in ("train", "validation", "test"):
    print(len(ds[split_name]))

In [None]:
# On-disk location of the FAISS index, keyed by embedding model and dataset subset.
KNOWLEDGE_VECTOR_DB_PATH = f"vector_store/{EMBEDDING_MODEL_NAME.replace('/', '~')}_{ds_name}"

if os.path.isdir(KNOWLEDGE_VECTOR_DB_PATH):
    print("Loading existing knowledge vector database...")
    # NOTE(review): allow_dangerous_deserialization=True unpickles the stored docstore.
    # Only load indexes that were written by this notebook; never point this at an
    # untrusted directory.
    KNOWLEDGE_VECTOR_DATABASE = FAISS.load_local(
        KNOWLEDGE_VECTOR_DB_PATH,
        embedding_model,
        allow_dangerous_deserialization=True,
        distance_strategy=DistanceStrategy.COSINE,
    )

else:
    # Collect every document of every split. Reading only the "documents" column avoids
    # materialising all other columns of each row. dict.fromkeys() removes duplicates
    # while keeping first-seen order, so the index is built in the same order on every
    # run (a set would reorder the texts on each kernel start).
    RAW_KNOWLEDGE_BASE = list(
        dict.fromkeys(
            doc
            for split in ds
            for documents in ds[split]["documents"]
            for doc in documents
        )
    )
    print(f"Number of documents in knowledge base: {len(RAW_KNOWLEDGE_BASE)}")

    print("Creating knowledge vector database...")
    KNOWLEDGE_VECTOR_DATABASE = FAISS.from_texts(
        RAW_KNOWLEDGE_BASE,
        embedding_model,
        distance_strategy=DistanceStrategy.COSINE,
    )
    KNOWLEDGE_VECTOR_DATABASE.save_local(KNOWLEDGE_VECTOR_DB_PATH)

In [None]:
class Args:
    """Default settings for the self-reflection pipeline.

    Values are plain class attributes; callers override them on an instance.
    """

    # Prompt ablations for the refinement instruction.
    no_number: bool = False  # leave the numeric score out of the refinement instruction
    no_aspect: bool = False  # use the generic "Please refine ..." instruction

    # Iteration budgets for the outer loop and the two inner loops.
    max_loop: int = 1
    max_knowledge_loop: int = 1
    max_response_loop: int = 1
    demo_num: int = 0  # forwarded to evaluate_knowledge

    # Scores below these thresholds trigger another refinement round.
    threshold_entailment: float = 0.8
    threshold_fact: float = -1
    threshold_consistency: float = -5

    # Text-generation settings forwarded to generate_step.
    temperature: float = 1.0
    top_p: float = 1
    top_k: int = 1
    num_beams: int = 1
    max_new_tokens: int = 128
    repetition_penalty: float = 1.0


# Experiment configuration: start from the Args defaults and override per-run settings
# on the instance.
args = Args()
vars(args).update(
    {
        # loop budgets
        "max_loop": 3,
        "max_knowledge_loop": 3,
        "max_response_loop": 3,
        "demo_num": 0,
        # refinement thresholds
        "threshold_entailment": 0.8,
        "threshold_fact": -1.0,
        "threshold_consistency": -5,
        # generation settings
        "max_new_tokens": 1024,
        "temperature": 1.0,
        "top_p": 0.90,
        "top_k": 50,
        "repetition_penalty": 1.2,
    }
)

In [None]:
def generate_step(
    model: AutoModelForCausalLM,
    tokenizer: AutoTokenizer,
    messages: List[dict[str, str]],
    max_new_tokens: int = 512,
    temperature: float = 0.2,
    top_p: float = 0.9,
    top_k: int = 50,
    num_beams: int = 1,
    repetition_penalty: float = 1.1,
    dola_decoding: bool = False,
    activation_dola_decoding: bool = False,
    dola_layers: Union[str, list[int]] = "high",
    sled_decoding: bool = False,
    activation_sled_decoding: bool = False,
    end_sled_decoding: bool = False,
    evolution_rate: float = 2.0,
    evolution_scale: int = 10,
    evolution_lower_bound: float = -1000.0,
    alpha: float = 0.5,
) -> str:
    """Generate one assistant reply for a chat, optionally with a custom decoding strategy.

    The chat is rendered with the tokenizer's chat template (generation prompt appended),
    tokenized, moved to ``model.device`` and passed to ``model.generate``. Only the newly
    generated tokens are decoded and returned.

    At most one custom decoding strategy is applied. When several flags are set, the
    first one in this order wins: ``dola_decoding``, ``activation_dola_decoding``,
    ``sled_decoding``, ``activation_sled_decoding``, ``end_sled_decoding``. With no flag
    set, the model's standard ``generate`` is used.

    Args:
        model: Model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer exposing ``apply_chat_template``, ``__call__``, ``decode``
            and ``eos_token_id``.
        messages: Chat messages as ``{"role": ..., "content": ...}`` dicts.
        max_new_tokens, temperature, top_p, top_k, num_beams, repetition_penalty:
            Forwarded unchanged to ``model.generate``. Sampling is enabled exactly when
            ``temperature > 0``.
        dola_layers: Forwarded to the DOLA / Activation-DOLA strategies.
        evolution_rate, evolution_scale, evolution_lower_bound: Forwarded to the SLED,
            Activation-SLED and End-SLED strategies.
        alpha: Forwarded to the Activation-DOLA, Activation-SLED and End-SLED strategies
            (previously hard-coded to 0.5).

    Returns:
        The decoded continuation with special tokens stripped.
    """
    formatted_chat = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # The chat template already inserted the special tokens, so do not add them again.
    inputs = tokenizer(formatted_chat, return_tensors="pt", add_special_tokens=False).to(model.device)

    # Arguments shared by every decoding strategy.
    generation_kwargs = {
        "max_new_tokens": max_new_tokens,
        "do_sample": temperature > 0,
        "temperature": temperature,
        "top_p": top_p,
        "top_k": top_k,
        "num_beams": num_beams,
        "repetition_penalty": repetition_penalty,
        "pad_token_id": tokenizer.eos_token_id,
        "eos_token_id": tokenizer.eos_token_id,
    }

    dola_kwargs = {"dola_layers": dola_layers}
    sled_kwargs = {
        "evolution_rate": evolution_rate,
        "evolution_scale": evolution_scale,
        "evolution_lower_bound": evolution_lower_bound,
    }
    # (enabled, label for the log line, custom_generate location, strategy-specific kwargs),
    # listed in priority order.
    strategies = (
        (dola_decoding, "DOLA", "custom_decoding/dola", dola_kwargs),
        (activation_dola_decoding, "Activation DOLA", "custom_decoding/activation_dola", {**dola_kwargs, "alpha": alpha}),
        (sled_decoding, "SLED", "custom_decoding/sled", sled_kwargs),
        (activation_sled_decoding, "Activation SLED", "custom_decoding/activation_sled", {**sled_kwargs, "alpha": alpha}),
        (end_sled_decoding, "End-SLED", "custom_decoding/end_sled", {**sled_kwargs, "alpha": alpha}),
    )
    for enabled, label, custom_generate, strategy_kwargs in strategies:
        if enabled:
            print(f"=> Using {label} decoding...")
            generation_kwargs.update(
                custom_generate=custom_generate,
                trust_remote_code=True,
                **strategy_kwargs,
            )
            break

    outputs = model.generate(**inputs, **generation_kwargs)

    # Drop the prompt tokens; decode only what the model produced.
    prompt_length = inputs["input_ids"].size(1)
    return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)


def knowledge_loop(
    args: Args,
    question: str,
    retrieved_docs: List[str],
) -> Tuple[str, List[Tuple[int, str, float]]]:
    """Draft background knowledge for a question and refine it until it scores well enough.

    A first paragraph is generated from the retrieved documents. When
    ``args.max_knowledge_loop`` is greater than 1, each draft is scored with
    ``evaluate_knowledge`` and regenerated while the score is below
    ``args.threshold_fact`` and rounds remain.

    Returns:
        ``(knowledge, history)``. ``knowledge`` is the latest draft, or, if refinement was
        enabled and the last score is still below the threshold, the highest-scoring
        draft. ``history`` holds one ``[round, draft, score]`` entry per scored draft and
        is empty when no scoring took place.
    """
    print("knowledge_loop")

    min_factuality = args.threshold_fact
    max_rounds = args.max_knowledge_loop
    refinement_enabled = max_rounds > 1

    def draft(chat):
        # Every draft uses the same generation settings and Activation-SLED decoding.
        return generate_step(
            model,
            tokenizer,
            chat,
            max_new_tokens=args.max_new_tokens,
            temperature=args.temperature,
            top_p=args.top_p,
            top_k=args.top_k,
            num_beams=args.num_beams,
            repetition_penalty=args.repetition_penalty,
            activation_sled_decoding=True,
            evolution_rate=0.5,
            evolution_scale=75,
        )

    scored_drafts = []
    trace = []

    context_block = "\n".join("- " + doc for doc in retrieved_docs)
    first_system_prompt = (
        "You are an AI language model designed to provide accurate, relevant, and comprehensive background knowledge based on the given question and retrieved context, with the ability to incorporate additional context when necessary. "
        "Sometimes, the retrieved context may not be sufficient to answer the question accurately. In such cases, you should use your general knowledge to supplement the information from the retrieved context."
    )
    refine_system_prompt = "You are an AI language model designed to provide accurate, relevant, and comprehensive background knowledge based on the given question and retrieved context, with the ability to incorporate additional context when necessary."

    knowledge = draft(
        [
            {"role": "system", "content": first_system_prompt},
            {
                "role": "user",
                "content": f'Provide background knowledge in one paragraph (approximately 400 words) to answer the question: "{question}", based on the following retrieved context:\n{context_block}',
            },
        ]
    )

    # The score is only computed when refinement is enabled; otherwise it is never read.
    factuality_score = None
    if refinement_enabled:
        factuality_score = evaluate_knowledge(model, args.demo_num, question, knowledge, tokenizer)
        scored_drafts.append([factuality_score, knowledge])
        trace.append([0, knowledge, factuality_score])

    for round_idx in range(1, max_rounds):
        if not (factuality_score < min_factuality):
            break

        if args.no_aspect:
            instruction = "Please refine the knowledge."
        elif args.no_number:
            instruction = "The knowledge is not strongly supported by empirical evidence. Please refine the knowledge to improve its factuality."
        else:
            instruction = f"The factuality score for the knowledge is {factuality_score} less than {min_factuality}, which means the knowledge is not strongly supported by empirical evidence. Please refine the knowledge to improve its factuality."

        knowledge = draft(
            [
                {"role": "system", "content": refine_system_prompt},
                {
                    "role": "user",
                    "content": f'The provided background knowledge for the question: "{question}" is "{knowledge}", based on the following retrieved context:\n{context_block}".\n\n{instruction}\nThe refined knowledge should be in one paragraph (approximately 400 words).',
                },
            ]
        )

        factuality_score = evaluate_knowledge(model, args.demo_num, question, knowledge, tokenizer)
        scored_drafts.append([factuality_score, knowledge])
        trace.append([round_idx, knowledge, factuality_score])

    if refinement_enabled and factuality_score < min_factuality:
        # Budget exhausted without reaching the threshold: fall back to the best-scoring draft.
        return sorted(scored_drafts)[-1][-1], trace
    return knowledge, trace


def response_loop(
    args: Args,
    question: str,
    final_knowledge: str,
) -> Tuple[str, List[Tuple[int, str, float, float]], float]:
    """Answer a question from background knowledge and refine the answer until it is consistent.

    A first answer is generated and scored with ``evaluate_response``, which yields the
    entailment score against the question and the consistency score against the
    knowledge. While the consistency score is below ``args.threshold_consistency`` and
    fewer than ``args.max_response_loop`` answers have been produced, the model is asked
    to refine its answer.

    The first answer is always scored, even when ``args.max_response_loop <= 1``: the
    caller needs the entailment score, and skipping the evaluation used to leave it
    unassigned, which raised ``UnboundLocalError`` on return.

    Args:
        args: Settings object; reads the generation settings, ``max_response_loop``,
            ``threshold_consistency``, ``no_aspect`` and ``no_number``.
        question: The user question.
        final_knowledge: Background knowledge the answer must be consistent with.

    Returns:
        ``(response, history, entailment_score)``. ``response`` is the latest answer, or,
        if refinement was enabled and the last consistency score is still below the
        threshold, the answer with the highest mean of entailment and consistency.
        ``entailment_score`` belongs to the returned answer. ``history`` holds one
        ``[round, response, entailment, consistency]`` entry per scored answer.
    """
    print("response_loop")

    THRESHOLD_CONS = args.threshold_consistency
    MAX_RESPONSE_LOOP = args.max_response_loop
    SYSTEM_PROMPT = "You are an AI language model designed to provide accurate, relevant, and comprehensive answers to questions based on the given background knowledge."

    candidates = []
    entailment_score_question_list = []
    history = []

    def _generate(messages):
        # Every answer uses the same generation settings and Activation-SLED decoding.
        return generate_step(
            model,
            tokenizer,
            messages,
            max_new_tokens=args.max_new_tokens,
            temperature=args.temperature,
            top_p=args.top_p,
            top_k=args.top_k,
            num_beams=args.num_beams,
            repetition_penalty=args.repetition_penalty,
            activation_sled_decoding=True,
            evolution_rate=0.5,
            evolution_scale=75,
        )

    def _score_and_record(loop_i, response):
        # Score one answer and remember it as a candidate for the final selection.
        entailment, consistency = evaluate_response(entailment_scorer, ctrleval_scorer, question, response, final_knowledge)
        candidates.append([(entailment + consistency) / 2, response])
        entailment_score_question_list.append(entailment)
        history.append([loop_i, response, entailment, consistency])
        return entailment, consistency

    response = _generate(
        [
            {"role": "system", "content": SYSTEM_PROMPT},
            {
                "role": "user",
                "content": f'Refer to the background knowledge: "{final_knowledge}" and answer the question: "{question}" with one paragraph.',
            },
        ]
    )
    entailment_score_question, cons_score_knowledge = _score_and_record(0, response)

    for loop_i in range(1, MAX_RESPONSE_LOOP):
        if not (cons_score_knowledge < THRESHOLD_CONS):
            break

        if args.no_aspect:
            instruction = "Please refine the response."
        elif args.no_number:
            instruction = "The alignment and consistency between response and knowledge are low. Please refine the response to improve its consistency."
        else:
            instruction = f"The consistency score for the response is {cons_score_knowledge} less than {THRESHOLD_CONS}, which means the alignment and consistency between response and knowledge are low. Please refine the response to improve its consistency."

        response = _generate(
            [
                {"role": "system", "content": SYSTEM_PROMPT},
                {
                    "role": "user",
                    "content": f'The generated response for the question: "{question}" is "{response}" based on the background knowledge: "{final_knowledge}".\n\n{instruction}',
                },
            ]
        )
        entailment_score_question, cons_score_knowledge = _score_and_record(loop_i, response)

    if MAX_RESPONSE_LOOP > 1 and cons_score_knowledge < THRESHOLD_CONS:
        # Budget exhausted without reaching the threshold: fall back to the candidate with
        # the highest combined score, together with its own entailment score.
        best_candidate, best_entailment = max(zip(candidates, entailment_score_question_list))
        return best_candidate[-1], history, best_entailment
    return response, history, entailment_score_question


def reflection_loop(
    args: Args,
    question: str,
    retrieved_docs: List[str],
) -> Tuple[str, str, List[Tuple[int, str, float]], List[Tuple[int, str, float, float]]]:
    """Run the outer self-reflection loop: knowledge round, then response round, repeated.

    Each round calls ``knowledge_loop`` and then ``response_loop``. Rounds repeat while
    the entailment score returned by ``response_loop`` is below
    ``args.threshold_entailment`` and fewer than ``args.max_loop`` rounds have run.

    Returns:
        ``(knowledge, response, knowledge_history, response_history)``. The knowledge and
        response come from the last round, or, if ``args.max_loop > 1`` and the last
        entailment score is still below the threshold, from the round with the highest
        entailment score. The histories are the concatenated histories of all rounds.
    """
    entail_threshold = args.threshold_entailment
    max_rounds = args.max_loop

    knowledge_trace, response_trace = [], []
    rounds = []

    round_idx = 0
    while True:
        print(f"main_loop {round_idx}")

        final_knowledge, knowledge_history = knowledge_loop(args, question, retrieved_docs)
        knowledge_trace.extend(knowledge_history)

        final_response, response_history, entailment_score = response_loop(args, question, final_knowledge)
        response_trace.extend(response_history)

        rounds.append([entailment_score, final_knowledge, final_response])
        round_idx += 1

        # Stop once the budget is used up or the answer is entailed well enough.
        if not (round_idx < max_rounds and entailment_score < entail_threshold):
            break

    if max_rounds > 1 and entailment_score < entail_threshold:
        # Threshold never reached: fall back to the round with the highest entailment score.
        _, final_knowledge, final_response = sorted(rounds)[-1]

    return final_knowledge, final_response, knowledge_trace, response_trace

In [None]:
def answer_with_rag_reflection(
    args: Args,
    question: str,
    use_reranker: bool = True,
    num_retrieved_docs: int = 30,
    num_docs_final: int = 7,
) -> Tuple[str, List[str]]:
    """Answer a question with retrieval, optional reranking and the self-reflection loop.

    Args:
        args: Settings forwarded to ``reflection_loop``.
        question: The user question; also used as the retrieval and reranking query.
        use_reranker: Rerank the retrieved passages with ``reranker`` when True.
        num_retrieved_docs: Number of passages fetched from ``KNOWLEDGE_VECTOR_DATABASE``.
        num_docs_final: Maximum number of passages handed to the generator.

    Returns:
        ``(final_response, passages)``: the generated answer and the passage texts it
        was generated from.
    """
    # Step 1: dense retrieval from the vector store.
    print("=> Retrieving documents...")
    hits = KNOWLEDGE_VECTOR_DATABASE.similarity_search(query=question, k=num_retrieved_docs)
    passages = [hit.page_content for hit in hits]

    # Step 2: optional reranking; only the passage text of each reranked entry is kept.
    if use_reranker:
        print("=> Reranking documents...")
        reranked = reranker.rerank(question, passages, k=num_docs_final)
        passages = [entry["content"] for entry in reranked]

    # Cap the context size whether or not reranking ran.
    passages = passages[:num_docs_final]

    # Step 3: generate the answer; only the final response is needed here.
    print("=> Generating response with self-reflection...")
    _knowledge, final_response, _knowledge_history, _response_history = reflection_loop(args, question, passages)

    return final_response, passages

In [None]:
# Single-question smoke test (uncomment to try one sample before the full run):
#response, relevant_docs = answer_with_rag_reflection(args, ds["test"][0]["question"], use_reranker=True)

In [None]:
num_samples = 100
output_dir = "results/exp-3"
# Create the output directory before the expensive generation loop, so a missing folder
# cannot cause the results to be lost when they are written at the end.
os.makedirs(output_dir, exist_ok=True)

# Never ask for more rows than the test split contains.
test_split = ds["test"]
num_samples = min(num_samples, len(test_split))

# One record per test question: the generated answer, the contexts it was generated from,
# and the reference answer / scores copied from the dataset row.
dataset = []
for d in tqdm(test_split.select(range(num_samples)), total=num_samples, desc="Processing test samples"):
    question = d["question"]
    reference = d["response"]

    final_response, relevant_docs = answer_with_rag_reflection(args, question, use_reranker=True)

    dataset.append(
        {
            "user_input": question,
            "retrieved_contexts": relevant_docs,
            "response": final_response,
            "reference": reference,
            "adherence_score": d["adherence_score"],
            "relevance_score": d["relevance_score"],
        }
    )

responses_path = os.path.join(
    output_dir,
    f"{MODEL_NAME.replace('/', '~')}_{ds_name}_rag-responses_activation-sled-decoding_self-reflection-full.json",
)
with open(responses_path, "w", encoding="utf-8") as f:
    json.dump(dataset, f, indent=2)

In [None]:
# Judge model for the RAGAS metrics: Gemini at temperature 0, wrapped for RAGAS.
_judge_chat_model = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash-lite",
    temperature=0,
    max_retries=2,
    max_tokens=None,
    timeout=None,
)
evaluator_llm = LangchainLLMWrapper(_judge_chat_model)

In [None]:
# Convert the collected records to a Hugging Face Dataset and score them with RAGAS.
evaluation_dataset = Dataset.from_list(dataset)

ragas_metrics = [FactualCorrectness(), Faithfulness(), ContextRelevance()]
ragas_result = evaluate(
    evaluation_dataset,
    metrics=ragas_metrics,
    llm=evaluator_llm,
)
print(ragas_result)

In [None]:
# Persist the per-sample RAGAS scores next to the generated responses.
ragas_result_df = ragas_result.to_pandas()
ragas_csv_name = f"{MODEL_NAME.replace('/', '~')}_{ds_name}_ragas-results_activation-sled-decoding_self-reflection-full.csv"
ragas_result_df.to_csv(os.path.join(output_dir, ragas_csv_name), index=False)
#ragas_result_df.head()

In [None]:
import sys

sys.path.append("ragbench/ragbench")

from evaluation import calculate_metrics

In [None]:
# Attach the RAGAS predictions as columns for the RAGBench metric computation.
# Maps the column name to create -> the key of the metric in the RAGAS result.
predicted_columns = {
    "faithfulness": "faithfulness",
    "context_relevance": "nv_context_relevance",
}
for column_name, ragas_key in predicted_columns.items():
    # add_column fails if the column already exists; drop it first so this cell can be
    # re-run without restarting the kernel.
    if column_name in evaluation_dataset.column_names:
        evaluation_dataset = evaluation_dataset.remove_columns(column_name)
    evaluation_dataset = evaluation_dataset.add_column(column_name, ragas_result[ragas_key])
evaluation_dataset

In [None]:
# RAGBench metrics: the RAGAS predictions are compared with the dataset's ground-truth
# columns noted on each line.
ragbench_prediction_columns = {
    "pred_adherence": "faithfulness",  # adherence_score
    "pred_context_relevance": "context_relevance",  # relevance_score
}
metrics = calculate_metrics(evaluation_dataset, **ragbench_prediction_columns)
print(metrics)