In [None]:
import sys
sys.path.append("self_reflection")

import torch
from datasets import Dataset, load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from langchain_google_genai import ChatGoogleGenerativeAI
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import FactualCorrectness

from CTRLEval.ctrleval import CTRLEval
from evaluate.sent_similarity import Sent_Similar
from loop_utils import main_loop
from loop import knowledge_loop, response_loop

In [None]:
# Test split of the "hotpotqa" configuration of the RAGBench dataset.
ds = load_dataset(
    "rungalileo/ragbench",
    name="hotpotqa",
    split="test",
)

In [None]:
# Show the first example's question as a quick sanity check of the loaded split.
ds[0]["question"]

In [None]:
# Resource files for the CTRLEval scorer. The paths are relative, so the
# notebook has to be launched from the directory that contains
# `self_reflection/`.
_ctrleval_resources = {
    "iwf_dir": "self_reflection/CTRLEval/iwf_full.txt",
    "prompt_dir": "self_reflection/CTRLEval/prompt/prompt_topic.txt",
    "verbal_dir": "self_reflection/CTRLEval/prompt/verbal_topic.txt",
}

# Both scorers are handed to `main_loop` in the generation cell.
ctrleval_scorer = CTRLEval(device="cuda", **_ctrleval_resources)
entailment_scorer = Sent_Similar()


In [None]:
# Hugging Face Hub id shared by the model and its tokenizer.
MODEL_NAME = "Qwen/Qwen3-4B-Instruct-2507"

# 4-bit NF4 quantization with double quantization enabled; bfloat16 is used
# as the compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

# `device_map="auto"` leaves weight placement to the library.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    quantization_config=bnb_config,
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [None]:
class Args:
    """Namespace of default settings passed to `main_loop` as `args`.

    Every value is a plain class attribute, so an instance starts with these
    defaults and individual fields can be overridden per instance without
    touching the class.
    """

    # Behaviour switches.
    # NOTE(review): exact meaning is defined by the self_reflection code - confirm there.
    continue_generate: bool = False
    no_number: bool = False
    no_aspect: bool = False

    # Iteration budgets and number of demonstrations.
    max_loop: int = 1
    max_knowledge_loop: int = 1
    max_response_loop: int = 1
    demo_num: int = 0

    # Score thresholds.
    # NOTE(review): presumably compared against the entailment / CTRLEval
    # scorer outputs - confirm in loop_utils.
    threshold_entailment: float = 0.8
    threshold_fact: float = -1
    threshold_consistency: float = -5

    # Sampling / decoding settings.
    max_sample: int = 3000
    temperature: float = 1.0
    top_p: float = 1
    top_k: int = 1
    num_beams: int = 1
    max_new_tokens: int = 128

In [None]:
# Settings for this run: start from the class defaults and override per
# instance. Some entries repeat the default value so the full configuration
# of the experiment is visible in one place.
args = Args()

_run_overrides = {
    "max_loop": 3,
    "max_knowledge_loop": 3,
    "max_response_loop": 3,
    "demo_num": 0,
    "threshold_entailment": 0.8,
    "threshold_fact": -1.0,
    "threshold_consistency": -5,
    "max_new_tokens": 256,
}
for _field, _value in _run_overrides.items():
    setattr(args, _field, _value)

#final_knowledge, final_response, all_history_knowledge, all_history_response = main_loop(args, ds[0], model, tokenizer, knowledge_loop, response_loop, entailment_scorer, ctrleval_scorer)

In [None]:
#print(final_response)

In [None]:
# Number of test examples to push through the self-reflection loop.
# The count is capped at the split size because `Dataset.select` rejects
# out-of-range indices, which would otherwise break the cell on a smaller
# split or configuration.
NUM_EVAL_SAMPLES = 100
num_eval_samples = min(NUM_EVAL_SAMPLES, len(ds))

# Rebuilt from scratch on every run so re-executing the cell never
# accumulates duplicate rows.
dataset = []
for d in ds.select(range(num_eval_samples)):
    question = d["question"]
    # The dataset's own `response` field is used as the reference answer.
    reference = d["response"]

    # `main_loop` returns four values; only the second (the final response)
    # is needed for evaluation.
    _, final_response, _, _ = main_loop(
        args,
        d,
        model,
        tokenizer,
        knowledge_loop,
        response_loop,
        entailment_scorer,
        ctrleval_scorer,
    )

    # One record per example; these keys become the columns of the
    # evaluation dataset built in the scoring cell.
    dataset.append(
        {
            "user_input": question,
            "response": final_response,
            "reference": reference,
        }
    )

In [None]:
# Judge model used by the RAGAS metric. No API key is passed here, so
# credentials must be supplied outside this cell
# (presumably via an environment variable - confirm).
_judge_settings = {
    "model": "gemini-2.5-flash",
    "temperature": 0,
    "max_tokens": None,
    "timeout": None,
    "max_retries": 2,
}
llm = ChatGoogleGenerativeAI(**_judge_settings)

# Wrap the LangChain chat model so it can be passed to ragas as `llm`.
evaluator_llm = LangchainLLMWrapper(llm)

In [None]:
# Turn the collected records (user_input / response / reference) into a
# Hugging Face dataset and score them with the Gemini-backed evaluator.
evaluation_dataset = Dataset.from_list(dataset)

factual_correctness = FactualCorrectness()
ragas_result = evaluate(
    evaluation_dataset,
    metrics=[factual_correctness],
    llm=evaluator_llm,
)
print(ragas_result)