In [None]:
from dotenv import load_dotenv
load_dotenv(".env")

import os
import json

import torch
from datasets import Dataset, load_dataset
from langchain_google_genai import ChatGoogleGenerativeAI
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import FactualCorrectness
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm

In [None]:
# Subset (configuration) of the "rungalileo/ragbench" dataset to evaluate on.
# The alternatives stay commented out; swap the active assignment to switch.
#ds_name = "pubmedqa"
#ds_name = "delucionqa"
ds_name = "hotpotqa"

# Only the "test" split is loaded; its row count is printed as a quick sanity check.
ds = load_dataset(path="rungalileo/ragbench", name=ds_name, split="test")
print(ds.num_rows)

In [None]:
# Hugging Face checkpoint under evaluation; the commented line is an alternative.
#MODEL_NAME = "meta-llama/Llama-3.2-3B-Instruct"
MODEL_NAME = "Qwen/Qwen3-4B-Instruct-2507"

# Load the weights in bfloat16 onto the CUDA device, then switch to eval mode.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="cuda",
    dtype=torch.bfloat16,
)
model = model.eval()

# Tokenizer comes from the same checkpoint so its chat template matches the model.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [None]:
# Generate one sampled answer per test question and persist the
# (user_input, response, reference) records as JSON for the evaluation cells below.

# Sampling is enabled in generate(); seeding torch makes a rerun on the same
# setup produce the same responses instead of a fresh random draw each time.
SEED = 42
torch.manual_seed(SEED)

num_samples = 100
# Never ask for more rows than the split holds: Dataset.select raises on
# out-of-range indices, which would abort the cell on a small subset.
num_eval = min(num_samples, len(ds))

# Resolve and create the output location BEFORE generating, so a missing
# directory cannot throw away the results of the expensive loop at write time.
output_dir = "results/exp-0"
os.makedirs(output_dir, exist_ok=True)
responses_path = os.path.join(output_dir, f"{MODEL_NAME.replace('/', '~')}_{ds_name}_responses.json")

dataset = []
for d in tqdm(ds.select(range(num_eval)), total=num_eval, desc="Processing test samples"):
    question = d["question"]
    # The dataset's own "response" field serves as the reference answer.
    reference = d["response"]

    # Single-turn prompt: the question only, no retrieved context is passed in.
    messages = [
        {
            "role": "user",
            "content": f"""Answer the following question in one paragraph.

Question: {question}
Answer:""",
        },
    ]

    # The chat template already inserts the special tokens, hence
    # add_special_tokens=False when tokenizing the rendered string.
    formatted_chat = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(formatted_chat, return_tensors="pt", add_special_tokens=False).to(model.device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=512,
        do_sample=True,
        temperature=1.0,
        top_p=0.9,
        repetition_penalty=1.1,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
    # Drop the prompt tokens so only the newly generated continuation is decoded.
    prompt_length = inputs["input_ids"].size(1)
    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    dataset.append(
        {
            "user_input": question,
            "response": response,
            "reference": reference,
        }
    )

with open(responses_path, "w") as f:
    json.dump(dataset, f, indent=2)

In [None]:
# Judge model for the ragas metric: a Gemini chat model wrapped in the
# LangChain adapter that ragas accepts.
# NOTE(review): credentials are presumably supplied through the environment
# loaded by load_dotenv(".env") — confirm.
gemini_chat = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash-lite",
    temperature=0.1,
    max_tokens=None,
    timeout=None,
    max_retries=4,
)
evaluator_llm = LangchainLLMWrapper(gemini_chat)

In [None]:
# Turn the list of generated records into a Hugging Face Dataset for ragas.
evaluation_dataset = Dataset.from_list(dataset)

# Score every record with the FactualCorrectness metric, judged by evaluator_llm.
factual_metrics = [FactualCorrectness()]
ragas_result = evaluate(
    evaluation_dataset,
    metrics=factual_metrics,
    llm=evaluator_llm,
)
print(ragas_result)

In [None]:
# Write the ragas scores as CSV into output_dir, using the same
# "<model>_<subset>_..." naming as the responses JSON ('/' in the model name -> '~').
results_filename = f"{MODEL_NAME.replace('/', '~')}_{ds_name}_ragas-results.csv"
results_path = os.path.join(output_dir, results_filename)

ragas_result_df = ragas_result.to_pandas()
ragas_result_df.to_csv(results_path, index=False)
#ragas_result_df.head()