In [1]:
from dotenv import load_dotenv
load_dotenv(".env")

import os
from typing import List, Tuple, Union

import torch
import json
from datasets import Dataset, load_dataset
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores.utils import DistanceStrategy
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_huggingface import HuggingFaceEmbeddings
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import ContextRelevance, FactualCorrectness, Faithfulness
from ragatouille import RAGPretrainedModel
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm

********************************************************************************
--------------------------------------------
RAGatouille version 0.0.10 will be migrating to a PyLate backend 
instead of the current Stanford ColBERT backend.
PyLate is a fully mature, feature-equivalent backend, that greatly facilitates compatibility.
However, please pin version <0.0.10 if you require the Stanford ColBERT backend.
********************************************************************************
  from ragatouille import RAGPretrainedModel


In [2]:
ds_name = "hotpotqa"
#ds_name = "pubmedqa"
#ds_name = "delucionqa"
ds = load_dataset("rungalileo/ragbench", ds_name)
print(len(ds["train"]))
print(len(ds["validation"]))
print(len(ds["test"]))

1883
424
390


In [3]:
EMBEDDING_MODEL_NAME = "Qwen/Qwen3-Embedding-0.6B"

embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    multi_process=False,
    model_kwargs={"device": "cuda"},
    encode_kwargs={"normalize_embeddings": True},  # Set `True` for cosine similarity
)

KNOWLEDGE_VECTOR_DB_PATH = f"vector_store/{EMBEDDING_MODEL_NAME.replace('/', '~')}_{ds_name}"

if os.path.isdir(KNOWLEDGE_VECTOR_DB_PATH):
    print("Loading existing knowledge vector database...")
    KNOWLEDGE_VECTOR_DATABASE = FAISS.load_local(
        KNOWLEDGE_VECTOR_DB_PATH,
        embedding_model,
        allow_dangerous_deserialization=True,
        distance_strategy=DistanceStrategy.COSINE,
    )

else:
    RAW_KNOWLEDGE_BASE = []

    for split in ds:
        for d in ds[split]:
            for doc in d["documents"]:
                RAW_KNOWLEDGE_BASE.append(doc)

    RAW_KNOWLEDGE_BASE = list(set(RAW_KNOWLEDGE_BASE))
    print(f"Number of documents in knowledge base: {len(RAW_KNOWLEDGE_BASE)}")

    print("Creating knowledge vector database...")
    KNOWLEDGE_VECTOR_DATABASE = FAISS.from_texts(RAW_KNOWLEDGE_BASE, embedding_model, distance_strategy=DistanceStrategy.COSINE)
    KNOWLEDGE_VECTOR_DATABASE.save_local(KNOWLEDGE_VECTOR_DB_PATH)

Loading existing knowledge vector database...


In [4]:
READER_MODEL_NAME = "Qwen/Qwen3-4B-Instruct-2507"

model = AutoModelForCausalLM.from_pretrained(READER_MODEL_NAME, device_map="cuda", dtype=torch.bfloat16).eval()
tokenizer = AutoTokenizer.from_pretrained(READER_MODEL_NAME)

RERANKER = RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0")

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

  self.scaler = torch.cuda.amp.GradScaler()


In [5]:
def generate_step(
    model: AutoModelForCausalLM,
    tokenizer: AutoTokenizer,
    messages: List[dict[str, str]],
    max_new_tokens: int = 512,
    temperature: float = 0.2,
    top_p: float = 0.9,
    top_k: int = 50,
    num_beams: int = 1,
    repetition_penalty: float = 1.1,
    dola_decoding: bool = False,
    activation_dola_decoding: bool = False,
    dola_layers: Union[str, list[int]] = "high",
    sled_decoding: bool = False,
    activation_sled_decoding: bool = False,
    end_sled_decoding: bool = False,
    evolution_rate: float = 2.0,
    evolution_scale: int = 10,
    evolution_lower_bound: float = -1000.0,
) -> str:
    formatted_chat = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(formatted_chat, return_tensors="pt", add_special_tokens=False).to(model.device)
    if dola_decoding:
        print("=> Using DOLA decoding...")
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=temperature > 0,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            num_beams=num_beams,
            repetition_penalty=repetition_penalty,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            custom_generate="custom_decoding/dola",
            trust_remote_code=True,
            dola_layers=dola_layers,
        )
    elif activation_dola_decoding:
        print("=> Using Activation DOLA decoding...")
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=temperature > 0,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            num_beams=num_beams,
            repetition_penalty=repetition_penalty,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            custom_generate="custom_decoding/activation_dola",
            trust_remote_code=True,
            dola_layers=dola_layers,
            alpha=0.5,
        )
    elif sled_decoding:
        print("=> Using SLED decoding...")
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=temperature > 0,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            num_beams=num_beams,
            repetition_penalty=repetition_penalty,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            custom_generate="custom_decoding/sled",
            trust_remote_code=True,
            evolution_rate=evolution_rate,
            evolution_scale=evolution_scale,
            evolution_lower_bound=evolution_lower_bound,
        )
    elif activation_sled_decoding:
        print("=> Using Activation SLED decoding...")
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=temperature > 0,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            num_beams=num_beams,
            repetition_penalty=repetition_penalty,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            custom_generate="custom_decoding/activation_sled",
            trust_remote_code=True,
            evolution_rate=evolution_rate,
            evolution_scale=evolution_scale,
            evolution_lower_bound=evolution_lower_bound,
            alpha=0.5,
        )
    elif end_sled_decoding:
        print("=> Using End-SLED decoding...")
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=temperature > 0,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            num_beams=num_beams,
            repetition_penalty=repetition_penalty,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            custom_generate="custom_decoding/end_sled",
            trust_remote_code=True,
            evolution_rate=evolution_rate,
            evolution_scale=evolution_scale,
            evolution_lower_bound=evolution_lower_bound,
            alpha=0.5,
        )
    else:
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=temperature > 0,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            num_beams=num_beams,
            repetition_penalty=repetition_penalty,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )
    return tokenizer.decode(outputs[0][inputs["input_ids"].size(1) :], skip_special_tokens=True)


def answer_with_rag(
    question: str,
    use_reranker: bool = True,
    num_retrieved_docs: int = 50,
    num_docs_final: int = 10,
    dola_decoding=False,
    activation_dola_decoding=False,
    sled_decoding=False,
    activation_sled_decoding=False,
    end_sled_decoding=False,
) -> Tuple[str, List[str]]:
    print("=> Retrieving documents...")
    relevant_docs = KNOWLEDGE_VECTOR_DATABASE.similarity_search(query=question, k=num_retrieved_docs)
    relevant_docs = [doc.page_content for doc in relevant_docs]  # Keep only the text

    if use_reranker:
        print("=> Reranking documents...")
        relevant_docs = RERANKER.rerank(question, relevant_docs, k=num_docs_final)
        relevant_docs = [doc["content"] for doc in relevant_docs]

    relevant_docs = relevant_docs[:num_docs_final]

    print("=> Generating answer...")
    messages = [
        {
            "role": "system",
            "content": """You are a chatbot providing answers to user queries. You will be given one or more context documents, and a question. \
Use the information in the documents to answer the question.

If the documents do not provide enough information to answer the question, provide the best possible answer based on your existing knowledge. \
Do not make up facts or provide incorrect information.""",
        },
        {
            "role": "user",
            "content": """Answer the question in one paragraph using the provided context.

Context:
{context}

Question: {question}""".format(
                context="\n".join(["- " + doc for doc in relevant_docs]),
                question=question,
            ),
        },
    ]
    response = generate_step(
        model,
        tokenizer,
        messages,
        max_new_tokens=512,
        temperature=0.2,
        repetition_penalty=1.2 if dola_decoding or sled_decoding else 1.1,
        dola_decoding=dola_decoding,
        activation_dola_decoding=activation_dola_decoding,
        sled_decoding=sled_decoding,
        activation_sled_decoding=activation_sled_decoding,
        end_sled_decoding=end_sled_decoding,
    )
    return response, relevant_docs

In [6]:
response, relevant_docs = answer_with_rag(ds["test"][0]["question"], use_reranker=True, end_sled_decoding=True)
print(response)

=> Retrieving documents...


  return torch.cuda.amp.autocast() if self.activated else NullContextManager()


=> Reranking documents...


100%|██████████| 2/2 [00:00<00:00, 28.68it/s]
`repetition_penalty` is set to a value of 1.1, which could induce unwanted repetition. The recommended value for SELD decoding is `repetition_penalty>=1.2`.


=> Generating answer...
=> Using End-SLED decoding...
0.5
None of the key figures mentioned in the documentary *Out to Win* (directed by Malcolm Ingram) are stated to have attended a specific university prior to being drafted by the New Jersey Nets. Additionally, there is no mention of any individual being selected 18th overall by the New Jersey Nets in the provided context. Therefore, based on the available information, it is impossible to determine which university one of these individuals paid for before being drafted. The reference to the New Jersey Nets drafting someone 18th overall does not align with any factual detail in the text regarding the film's subjects. Hence, the question cannot be answered with certainty using the provided context.


In [None]:
num_samples = 100
dataset = []
for d in tqdm(ds["test"].select(range(num_samples)), total=num_samples, desc="Processing test samples"):
    question = d["question"]
    reference = d["response"]
    response, relevant_docs = answer_with_rag(question, use_reranker=True, dola_decoding=True)

    dataset.append(
        {
            "user_input": question,
            "retrieved_contexts": relevant_docs,
            "response": response,
            "reference": reference,
            "adherence_score": d["adherence_score"],
            "relevance_score": d["relevance_score"],
        }
    )

output_dir = "results/exp-2"
with open(os.path.join(output_dir, f"{READER_MODEL_NAME.replace('/', '~')}_{ds_name}_rag-responses.json"), "w") as f:
    json.dump(dataset, f, indent=2)

In [None]:
evaluator_llm = LangchainLLMWrapper(
    ChatGoogleGenerativeAI(
        model="gemini-2.5-flash-lite",
        temperature=0.1,
        max_tokens=None,
        timeout=None,
        max_retries=4,
    )
)

In [None]:
evaluation_dataset = Dataset.from_list(dataset)

ragas_result = evaluate(evaluation_dataset, metrics=[FactualCorrectness(), Faithfulness(), ContextRelevance()], llm=evaluator_llm)
print(ragas_result)

In [None]:
ragas_result_df = ragas_result.to_pandas()
ragas_result_df.to_csv(os.path.join(output_dir, f"{READER_MODEL_NAME.replace('/', '~')}_{ds_name}_ragas-results.csv"), index=False)
#ragas_result_df.head()

In [None]:
import sys
sys.path.append("ragbench/ragbench")

from evaluation import calculate_metrics

In [None]:
evaluation_dataset = evaluation_dataset.add_column("faithfulness", ragas_result["faithfulness"])
evaluation_dataset = evaluation_dataset.add_column("context_relevance", ragas_result["nv_context_relevance"])
evaluation_dataset

In [None]:
metrics = calculate_metrics(
    evaluation_dataset,
    pred_adherence="faithfulness",  # adherence_score
    pred_context_relevance="context_relevance",  # relevance_score
)
print(metrics)