In [None]:
from dotenv import load_dotenv

# Read variables from the local ".env" file before the remaining imports run.
# NOTE(review): presumably this supplies API credentials (e.g. for the Gemini evaluator) — confirm.
load_dotenv(".env")

from typing import List, Optional, Tuple

import torch
from datasets import Dataset, load_dataset
from langchain.docstore.document import Document as LangchainDocument
from langchain.vectorstores import FAISS
from langchain_community.vectorstores.utils import DistanceStrategy
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_huggingface import HuggingFaceEmbeddings
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import ContextRelevance, FactualCorrectness, Faithfulness
from ragatouille import RAGPretrainedModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, Pipeline, pipeline

********************************************************************************
--------------------------------------------
RAGatouille version 0.0.10 will be migrating to a PyLate backend 
instead of the current Stanford ColBERT backend.
PyLate is a fully mature, feature-equivalent backend, that greatly facilitates compatibility.
However, please pin version <0.0.10 if you require the Stanford ColBERT backend.
********************************************************************************
  from ragatouille import RAGPretrainedModel


In [2]:
# Test split of the "hotpotqa" configuration of the RAGBench dataset.
ds = load_dataset(path="rungalileo/ragbench", name="hotpotqa", split="test")

In [4]:
# Gather the text of every document attached to any example, de-duplicated in first-seen order.
# dict.fromkeys (unlike set) keeps insertion order, so the knowledge base has the same document
# order on every run instead of depending on per-process string-hash randomization.
unique_doc_texts = list(dict.fromkeys(doc for d in ds for doc in d["documents"]))

# Wrap each unique text in a LangChain document so it can be handed to the vector store.
RAW_KNOWLEDGE_BASE = [LangchainDocument(page_content=text) for text in unique_doc_texts]

print(f"Number of documents in knowledge base: {len(RAW_KNOWLEDGE_BASE)}")

Number of documents in knowledge base: 1550


In [5]:
EMBEDDING_MODEL_NAME = "Qwen/Qwen3-Embedding-0.6B"

# Embedding model used both to index the knowledge base and to embed queries.
embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    model_kwargs=dict(device="cuda"),
    encode_kwargs=dict(normalize_embeddings=True),  # Set `True` for cosine similarity
    multi_process=True,
)

# Build the FAISS index over every knowledge-base document, using the cosine distance strategy.
KNOWLEDGE_VECTOR_DATABASE = FAISS.from_documents(
    documents=RAW_KNOWLEDGE_BASE,
    embedding=embedding_model,
    distance_strategy=DistanceStrategy.COSINE,
)

In [6]:
READER_MODEL_NAME = "Qwen/Qwen3-4B-Instruct-2507"

# Load the reader in 4-bit NF4 with double quantization; the compute dtype is bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(READER_MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    READER_MODEL_NAME,
    device_map="auto",
    quantization_config=bnb_config,
)

# Text-generation pipeline that returns only the newly generated text (not the prompt).
READER_LLM = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=500,
    return_full_text=False,
    # Sampling settings
    do_sample=True,
    temperature=0.2,
    repetition_penalty=1.1,
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Device set to use cuda:0


In [8]:
# Chat messages for the reader. The user message keeps `{context}` and `{question}` as
# str.format placeholders that are filled in once per query.
prompt_in_chat_format = [
    dict(
        role="system",
        content="""You are a chatbot providing answers to user queries. You will be given one or more context documents, and a question. \
Use the information in the documents to answer the question.

If the documents do not provide enough information for you to answer the question, then say \
"The documents are missing some of the information required to answer the question." Don't quote any external knowledge that is \
not in the documents. Don't try to make up an answer.""",
    ),
    dict(
        role="user",
        content="""Answer the question using the provided context.

Context:
{context}

Question: {question}""",
    ),
]

# Render the messages with the reader tokenizer's chat template, as plain text ending with the
# generation prompt for the assistant turn.
RAG_PROMPT_TEMPLATE = tokenizer.apply_chat_template(
    prompt_in_chat_format,
    add_generation_prompt=True,
    tokenize=False,
)
print(RAG_PROMPT_TEMPLATE)

<|im_start|>system
You are a chatbot providing answers to user queries. You will be given one or more context documents, and a question. Use the information in the documents to answer the question.

If the documents do not provide enough information for you to answer the question, then say "The documents are missing some of the information required to answer the question." Don't quote any external knowledge that is not in the documents. Don't try to make up an answer.<|im_end|>
<|im_start|>user
Answer the question using the provided context.

Context:
{context}

Question: {question}<|im_end|>
<|im_start|>assistant



In [10]:
RERANKER_MODEL_NAME = "colbert-ir/colbertv2.0"

# Pretrained model used to rerank the documents returned by the vector search.
RERANKER = RAGPretrainedModel.from_pretrained(RERANKER_MODEL_NAME)

  self.scaler = torch.cuda.amp.GradScaler()


In [11]:
def answer_with_rag(
    question: str,
    llm: Pipeline,
    knowledge_index: FAISS,
    reranker: Optional[RAGPretrainedModel] = None,
    num_retrieved_docs: int = 30,
    num_docs_final: int = 7,
) -> Tuple[str, List[str]]:
    """Answer a question with retrieval-augmented generation.

    The question is used to fetch `num_retrieved_docs` documents from `knowledge_index`. If a
    reranker is given, it reorders them and keeps the best `num_docs_final`; otherwise the first
    `num_docs_final` retrieved documents are kept. The kept documents are inserted as a bulleted
    list into the module-level `RAG_PROMPT_TEMPLATE`, and the prompt is passed to `llm`.

    Args:
        question: The user question.
        llm: Text-generation pipeline; called with the prompt string, and the `generated_text`
            of its first result is returned as the answer.
        knowledge_index: Vector store queried through `similarity_search`.
        reranker: Optional reranker; when `None`, retrieval order is used as is.
        num_retrieved_docs: Number of documents requested from the vector store.
        num_docs_final: Maximum number of documents placed in the prompt.

    Returns:
        A tuple `(answer, contexts)` where `contexts` is the list of document texts (plain
        strings) that were put into the prompt.
    """
    print("=> Retrieving documents...")
    retrieved = knowledge_index.similarity_search(query=question, k=num_retrieved_docs)
    relevant_docs = [doc.page_content for doc in retrieved]  # Keep only the text

    # Skip reranking when nothing was retrieved, and never ask the reranker for more documents
    # than it is given, so a small retrieval result cannot break the reranking step.
    if reranker is not None and relevant_docs:
        print("=> Reranking documents...")
        top_k = min(num_docs_final, len(relevant_docs))
        reranked = reranker.rerank(question, relevant_docs, k=top_k)
        relevant_docs = [doc["content"] for doc in reranked]

    relevant_docs = relevant_docs[:num_docs_final]

    context = "\n".join("- " + doc for doc in relevant_docs)
    final_prompt = RAG_PROMPT_TEMPLATE.format(question=question, context=context)

    print("=> Generating answer...")
    response = llm(final_prompt)[0]["generated_text"]
    return response, relevant_docs

In [None]:
NUM_EVAL_SAMPLES = 100  # Maximum number of test questions sent through the RAG pipeline
GENERATION_SEED = 42

# The reader pipeline samples its answers (do_sample=True), so seed torch before generating to
# make re-runs of this cell comparable.
torch.manual_seed(GENERATION_SEED)

dataset = []
# Clamp to the dataset size so a smaller split does not make `select` fail.
for d in ds.select(range(min(NUM_EVAL_SAMPLES, len(ds)))):
    question = d["question"]
    reference = d["response"]
    response, relevant_docs = answer_with_rag(question, READER_LLM, KNOWLEDGE_VECTOR_DATABASE, reranker=RERANKER)

    # relevant_docs = d['documents']  # For testing purpose, use reference docs as retrieved docs
    # response = reference  # For testing purpose, use reference as response

    dataset.append(
        {
            "user_input": question,
            "retrieved_contexts": relevant_docs,
            "response": response,
            "reference": reference,  # NOTE: Dataset answer; reference-based metrics (e.g. FactualCorrectness) compare against it
            # Ground-truth labels copied from the dataset row.
            "adherence_score": d["adherence_score"],
            "relevance_score": d["relevance_score"],
        }
    )

=> Retrieving documents...


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 43.73it/s]

=> Generating answer...





=> Retrieving documents...
=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 59.04it/s]

=> Generating answer...





In [None]:
# Alternative judge (disabled): run the evaluator locally through a Hugging Face pipeline.
#
# from langchain_huggingface import HuggingFacePipeline
#
# model_id = "Qwen/Qwen3-4B-Instruct-2507"
# evaluator_llm = LangchainLLMWrapper(
#    HuggingFacePipeline(
#        pipeline=pipeline(
#            "text-generation",
#            model=AutoModelForCausalLM.from_pretrained(model_id),
#            tokenizer=AutoTokenizer.from_pretrained(model_id),
#            device_map="auto",
#            max_new_tokens=32,
#        )
#    )
# )

# Active judge: Gemini at temperature 0, wrapped for use by ragas.
# NOTE(review): credentials are presumably picked up from the environment loaded via .env — confirm.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    temperature=0,
    max_retries=2,
    timeout=None,
    max_tokens=None,
)
evaluator_llm = LangchainLLMWrapper(llm)

E0000 00:00:1760925388.056067 4095465 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
  evaluator_llm = LangchainLLMWrapper(llm)


In [None]:
# Turn the collected samples into a Hugging Face dataset and score them with the judge LLM.
evaluation_dataset = Dataset.from_list(dataset)

ragas_metrics = [FactualCorrectness(), Faithfulness(), ContextRelevance()]
ragas_result = evaluate(evaluation_dataset, metrics=ragas_metrics, llm=evaluator_llm)
print(ragas_result)

# Attach the per-sample scores as new columns: (column added to the dataset, key in the ragas result).
for column_name, result_key in (
    ("faithfulness", "faithfulness"),
    ("context_relevance", "nv_context_relevance"),
):
    evaluation_dataset = evaluation_dataset.add_column(column_name, ragas_result[result_key])
evaluation_dataset

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

E0000 00:00:1760925391.371447 4095465 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


Dataset({
    features: ['user_input', 'retrieved_contexts', 'response', 'reference', 'adherence_score', 'relevance_score', 'faithfulness', 'context_relevance'],
    num_rows: 2
})

In [16]:
import sys

# Make the `evaluation` module under "ragbench/ragbench" importable (path is relative to the
# working directory). Only add the entry once so re-running this cell does not keep growing
# sys.path with duplicates.
RAGBENCH_EVAL_DIR = "ragbench/ragbench"
if RAGBENCH_EVAL_DIR not in sys.path:
    sys.path.append(RAGBENCH_EVAL_DIR)
from evaluation import calculate_metrics

In [17]:
# Compare the ragas predictions with the dataset's own labels.
# NOTE(review): the trailing comments name the ground-truth column each prediction is presumably
# scored against — confirm against `calculate_metrics`.
metrics = calculate_metrics(
    evaluation_dataset,
    pred_context_relevance="context_relevance",  # relevance_score
    pred_adherence="faithfulness",  # adherence_score
)
metrics

{'hallucination_auroc': 1.0, 'relevance_rmse': np.float64(0.7056741811375147)}