In [1]:
from dotenv import load_dotenv
load_dotenv(".env")

import os
from typing import List, Optional, Tuple

import torch
import json
from datasets import Dataset, load_dataset
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores.utils import DistanceStrategy
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_huggingface import HuggingFaceEmbeddings
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import ContextRelevance, FactualCorrectness, Faithfulness
from ragatouille import RAGPretrainedModel
from transformers import AutoModelForCausalLM, AutoTokenizer, Pipeline, pipeline
from tqdm import tqdm

********************************************************************************
--------------------------------------------
RAGatouille version 0.0.10 will be migrating to a PyLate backend 
instead of the current Stanford ColBERT backend.
PyLate is a fully mature, feature-equivalent backend, that greatly facilitates compatibility.
However, please pin version <0.0.10 if you require the Stanford ColBERT backend.
********************************************************************************
  from ragatouille import RAGPretrainedModel


In [2]:
# RAGBench subset to run the experiment on; uncomment one of the alternatives
# below to switch to a different subset.
ds_name = "hotpotqa"
#ds_name = "pubmedqa"
#ds_name = "delucionqa"
ds = load_dataset("rungalileo/ragbench", ds_name)

# Report the number of rows in each split, in train / validation / test order.
for _split_name in ("train", "validation", "test"):
    print(len(ds[_split_name]))

1883
424
390


In [3]:
EMBEDDING_MODEL_NAME = "Qwen/Qwen3-Embedding-0.6B"

embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    multi_process=False,
    model_kwargs={"device": "cuda"},
    encode_kwargs={"normalize_embeddings": True},  # Set `True` for cosine similarity
)

# One on-disk index per (embedding model, dataset) pair. "/" in the model id is
# replaced so the whole name stays a single path component.
KNOWLEDGE_VECTOR_DB_PATH = f"vector_store/{EMBEDDING_MODEL_NAME.replace('/', '~')}_{ds_name}"

if os.path.isdir(KNOWLEDGE_VECTOR_DB_PATH):
    print("Loading existing knowledge vector database...")
    # SECURITY: `allow_dangerous_deserialization=True` is an explicit opt-in to
    # unsafe deserialization. It is acceptable here only because this directory
    # is produced by the `save_local` call in the else-branch below; never point
    # KNOWLEDGE_VECTOR_DB_PATH at an index obtained from an untrusted source.
    KNOWLEDGE_VECTOR_DATABASE = FAISS.load_local(
        KNOWLEDGE_VECTOR_DB_PATH,
        embedding_model,
        allow_dangerous_deserialization=True,
        distance_strategy=DistanceStrategy.COSINE,
    )

else:
    # Collect every document from every split and drop duplicates.
    # `dict.fromkeys` keeps first-seen order, whereas `set` iteration order for
    # strings changes between interpreter sessions (hash randomisation), which
    # would make the order in which documents enter the index non-reproducible.
    RAW_KNOWLEDGE_BASE = list(
        dict.fromkeys(doc for split in ds for d in ds[split] for doc in d["documents"])
    )
    print(f"Number of documents in knowledge base: {len(RAW_KNOWLEDGE_BASE)}")

    print("Creating knowledge vector database...")
    KNOWLEDGE_VECTOR_DATABASE = FAISS.from_texts(RAW_KNOWLEDGE_BASE, embedding_model, distance_strategy=DistanceStrategy.COSINE)
    # Persist the index so later runs take the fast loading branch above.
    KNOWLEDGE_VECTOR_DATABASE.save_local(KNOWLEDGE_VECTOR_DB_PATH)

Loading existing knowledge vector database...


In [4]:
# Generator ("reader") model that writes the answer from the retrieved context.
READER_MODEL_NAME = "Qwen/Qwen3-4B-Instruct-2507"
#READER_MODEL_NAME = "meta-llama/Llama-3.2-3B-Instruct"

model = AutoModelForCausalLM.from_pretrained(
    READER_MODEL_NAME,
    device_map="cuda",
    dtype=torch.bfloat16,
)
model = model.eval()  # inference only
tokenizer = AutoTokenizer.from_pretrained(READER_MODEL_NAME)

# Text-generation pipeline with low-temperature sampling and a mild repetition
# penalty; `return_full_text=False` is set so the prompt is not part of the output.
READER_LLM = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    return_full_text=False,
    do_sample=True,
    temperature=0.2,
    repetition_penalty=1.1,
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Device set to use cuda


In [5]:
# System turn: keep the model grounded in the supplied documents and give it an
# explicit fallback sentence for questions the documents cannot answer.
system_message = {
    "role": "system",
    "content": (
        "You are a chatbot providing answers to user queries. "
        "You will be given one or more context documents, and a question. "
        "Use the information in the documents to answer the question.\n"
        "\n"
        "If the documents do not provide enough information for you to answer the question, then say "
        '"The documents are missing some of the information required to answer the question." '
        "Don't quote any external knowledge that is not in the documents. "
        "Don't try to make up an answer."
    ),
}

# User turn: `{context}` and `{question}` are left as literal placeholders here
# and filled in later with `str.format`.
user_message = {
    "role": "user",
    "content": (
        "Answer the question using the provided context.\n"
        "\n"
        "Context:\n"
        "{context}\n"
        "\n"
        "Question: {question}"
    ),
}

prompt_in_chat_format = [system_message, user_message]

# Render the conversation with the reader model's chat template as plain text,
# ending with the assistant header so generation starts at the answer.
RAG_PROMPT_TEMPLATE = tokenizer.apply_chat_template(
    prompt_in_chat_format,
    tokenize=False,
    add_generation_prompt=True,
)
print(RAG_PROMPT_TEMPLATE)

<|im_start|>system
You are a chatbot providing answers to user queries. You will be given one or more context documents, and a question. Use the information in the documents to answer the question.

If the documents do not provide enough information for you to answer the question, then say "The documents are missing some of the information required to answer the question." Don't quote any external knowledge that is not in the documents. Don't try to make up an answer.<|im_end|>
<|im_start|>user
Answer the question using the provided context.

Context:
{context}

Question: {question}<|im_end|>
<|im_start|>assistant



In [6]:
# Checkpoint for the reranker that re-scores the retrieved candidates.
RERANKER_MODEL_NAME = "colbert-ir/colbertv2.0"
RERANKER = RAGPretrainedModel.from_pretrained(RERANKER_MODEL_NAME)

  self.scaler = torch.cuda.amp.GradScaler()


In [None]:
def answer_with_rag(
    question: str,
    llm: Pipeline,
    knowledge_index: FAISS,
    reranker: Optional[RAGPretrainedModel] = None,
    num_retrieved_docs: int = 50,
    num_docs_final: int = 10,
    prompt_template: Optional[str] = None,
    verbose: bool = True,
) -> Tuple[str, List[str]]:
    """Answer ``question`` with retrieve -> (optional) rerank -> generate.

    Args:
        question: The user question; used both as the retrieval query and in
            the final prompt.
        llm: Callable text-generation pipeline. It is called with the prompt
            string and must return a list whose first item is a dict with a
            ``"generated_text"`` key.
        knowledge_index: Vector store queried through
            ``similarity_search(query=..., k=...)``; each hit must expose
            ``page_content``.
        reranker: Optional reranker. When given, ``rerank(question, docs, k=...)``
            is called and the ``"content"`` field of each returned item is kept.
        num_retrieved_docs: Number of candidates fetched from the index.
        num_docs_final: Maximum number of documents placed in the prompt.
        prompt_template: ``str.format`` template with ``{question}`` and
            ``{context}`` placeholders. Defaults to the notebook-level
            ``RAG_PROMPT_TEMPLATE`` when ``None``.
        verbose: Print a progress line for each stage when ``True``.

    Returns:
        A ``(response, relevant_docs)`` tuple: the generated text and the list
        of document strings that were put into the prompt.
    """

    def _log(message: str) -> None:
        # Progress output is optional so batch loops can keep their output clean.
        if verbose:
            print(message)

    # Resolve the template at call time so an explicit argument can override
    # the notebook global without the default being frozen at definition time.
    template = RAG_PROMPT_TEMPLATE if prompt_template is None else prompt_template

    _log("=> Retrieving documents...")
    hits = knowledge_index.similarity_search(query=question, k=num_retrieved_docs)
    relevant_docs = [hit.page_content for hit in hits]  # Keep only the text

    # Compare against None (not truthiness) so a reranker object that happens to
    # be falsy is still used, and skip reranking when nothing was retrieved.
    if reranker is not None and relevant_docs:
        _log("=> Reranking documents...")
        reranked = reranker.rerank(question, relevant_docs, k=num_docs_final)
        relevant_docs = [entry["content"] for entry in reranked]

    # Also caps the list when no reranker is used.
    relevant_docs = relevant_docs[:num_docs_final]

    # One "- " bullet per document, one document per line.
    context = "\n".join("- " + doc for doc in relevant_docs)
    final_prompt = template.format(question=question, context=context)

    _log("=> Generating answer...")
    response = llm(final_prompt)[0]["generated_text"]
    return response, relevant_docs

In [None]:
SEED = 42
num_samples = 100
output_dir = "results/exp-1"

# Create the output directory before the expensive generation loop, so a
# missing folder cannot fail the final write after all answers were generated.
os.makedirs(output_dir, exist_ok=True)

# The reader pipeline samples (do_sample=True); seed torch so a re-run of this
# cell draws from the same random stream.
torch.manual_seed(SEED)

# Never ask for more rows than the test split has (split sizes differ between
# RAGBench subsets), otherwise `select` fails on an out-of-range index.
test_split = ds["test"]
num_samples = min(num_samples, len(test_split))

dataset = []
for d in tqdm(test_split.select(range(num_samples)), total=num_samples, desc="Processing test samples"):
    question = d["question"]
    reference = d["response"]
    response, relevant_docs = answer_with_rag(question, READER_LLM, KNOWLEDGE_VECTOR_DATABASE, reranker=RERANKER)

    # relevant_docs = d['documents']  # For testing purpose, use reference docs as retrieved docs
    # response = reference  # For testing purpose, use reference as response

    # One record per question: the generated answer and its retrieved contexts,
    # plus the reference answer and the dataset's own adherence / relevance
    # scores carried over from the source row.
    dataset.append(
        {
            "user_input": question,
            "retrieved_contexts": relevant_docs,
            "response": response,
            "reference": reference,
            "adherence_score": d["adherence_score"],
            "relevance_score": d["relevance_score"],
        }
    )

# Persist the raw generations so evaluation can be repeated without regenerating.
responses_path = os.path.join(output_dir, f"{READER_MODEL_NAME.replace('/', '~')}_{ds_name}_rag-responses.json")
with open(responses_path, "w") as f:
    json.dump(dataset, f, indent=2)

In [None]:
# Alternative judge kept for reference: a local Hugging Face pipeline wrapped
# for RAGAS instead of the hosted Gemini model used below.
#
# from langchain_huggingface import HuggingFacePipeline
#
# model_id = "Qwen/Qwen3-4B-Instruct-2507"
# evaluator_llm = LangchainLLMWrapper(
#    HuggingFacePipeline(
#        pipeline=pipeline(
#            "text-generation",
#            model=AutoModelForCausalLM.from_pretrained(model_id),
#            tokenizer=AutoTokenizer.from_pretrained(model_id),
#            device_map="auto",
#            max_new_tokens=32,
#        )
#    )
# )

# Judge model for the RAGAS metrics: Gemini at low temperature, with no output
# token cap or timeout set here and up to four retries.
gemini_judge = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash-lite",
    temperature=0.1,
    max_tokens=None,
    timeout=None,
    max_retries=4,
)
evaluator_llm = LangchainLLMWrapper(gemini_judge)

In [None]:
# Turn the collected records into a Hugging Face Dataset for RAGAS.
evaluation_dataset = Dataset.from_list(dataset)

# Metrics scored by the judge LLM for every sample.
ragas_metrics = [FactualCorrectness(), Faithfulness(), ContextRelevance()]
ragas_result = evaluate(
    evaluation_dataset,
    metrics=ragas_metrics,
    llm=evaluator_llm,
)
print(ragas_result)

In [None]:
# Save the per-sample RAGAS scores next to the generated responses, named after
# the reader model and dataset subset.
ragas_results_filename = f"{READER_MODEL_NAME.replace('/', '~')}_{ds_name}_ragas-results.csv"
ragas_result_df = ragas_result.to_pandas()
ragas_result_df.to_csv(os.path.join(output_dir, ragas_results_filename), index=False)
#ragas_result_df.head()

In [None]:
import sys

# Make the `evaluation` module importable. The path is relative to the kernel's
# working directory (presumably a local checkout of the RAGBench code; verify).
# Only add it once so re-running this cell does not pile up duplicate entries.
RAGBENCH_EVAL_DIR = "ragbench/ragbench"
if RAGBENCH_EVAL_DIR not in sys.path:
    sys.path.append(RAGBENCH_EVAL_DIR)

from evaluation import calculate_metrics

In [None]:
# Attach the per-sample RAGAS scores to the evaluation dataset under the column
# names passed to `calculate_metrics` later on.
ragas_score_columns = {
    "faithfulness": ragas_result["faithfulness"],
    "context_relevance": ragas_result["nv_context_relevance"],
}
for column_name, scores in ragas_score_columns.items():
    # `add_column` refuses a name that already exists, so drop any copy left by
    # a previous run of this cell; this keeps the cell safe to re-run.
    if column_name in evaluation_dataset.column_names:
        evaluation_dataset = evaluation_dataset.remove_columns(column_name)
    evaluation_dataset = evaluation_dataset.add_column(column_name, scores)
evaluation_dataset

In [None]:
# Dataset columns holding the predicted scores; the trailing comments name the
# score column each prediction corresponds to.
prediction_columns = {
    "pred_adherence": "faithfulness",  # adherence_score
    "pred_context_relevance": "context_relevance",  # relevance_score
}
metrics = calculate_metrics(evaluation_dataset, **prediction_columns)
print(metrics)