In [1]:
import sys
sys.path.append("self_reflection")

from dotenv import load_dotenv
load_dotenv(".env")

import torch
from datasets import Dataset, load_dataset
from langchain_google_genai import ChatGoogleGenerativeAI
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import FactualCorrectness
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from tqdm import tqdm

from CTRLEval.ctrleval import CTRLEval
from evaluate.sent_similarity import Sent_Similar
from loop import knowledge_loop, response_loop
from loop_utils import main_loop

In [2]:
# Test split of the "hotpotqa" configuration of the RAGBench dataset.
ds = load_dataset(path="rungalileo/ragbench", name="hotpotqa", split="test")

In [3]:
# Peek at the first question to sanity-check the loaded split.
ds[0]["question"]

'Which university did one of the key figures in the American documentary film, released in 2015, directed by Malcolm Ingram, pay for, before being drafted 18th overall pick for the New Jersey Nets?'

In [None]:
# NOTE: this cell prints a "Some weights of PegasusForConditionalGeneration
# were not initialized from the model checkpoint ..." message for the encoder
# and decoder `embed_positions` weights (checkpoint google/pegasus-large in the
# recorded run). It is a warning, not an error, and can be ignored: Pegasus
# uses static, sinusoidal position embeddings (rather than learned embeddings)
# for both encoder and decoder.
ctrleval_scorer = CTRLEval(
    device="cuda",
    iwf_dir="self_reflection/CTRLEval/iwf_full.txt",
    prompt_dir="self_reflection/CTRLEval/prompt/prompt_topic.txt",
    verbal_dir="self_reflection/CTRLEval/prompt/verbal_topic.txt",
)

# Second scorer handed to `main_loop`; built with its default settings.
entailment_scorer = Sent_Similar()

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Hub identifier shared by the model and its tokenizer.
MODEL_NAME = "Qwen/Qwen3-4B-Instruct-2507"

# Load the weights in 4-bit NF4 with double quantization; compute in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    quantization_config=bnb_config,
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
class Args:
    """Default settings bundle passed to `main_loop` as its `args` argument.

    Every option is a plain class attribute, so a fresh instance starts with
    these values and single options can be overridden by attribute assignment.
    How each option is interpreted is decided by the code that receives the
    instance, which is not part of this notebook.
    """

    # Boolean switches, all off by default.
    no_aspect = False
    no_number = False
    continue_generate = False

    # Loop caps and demo count (presumably upper bounds on refinement
    # iterations and the number of few-shot demos -- confirm in loop_utils).
    demo_num = 0
    max_loop = 1
    max_knowledge_loop = 1
    max_response_loop = 1

    # Score thresholds (presumably acceptance cut-offs for the entailment and
    # CTRLEval scorers -- confirm in loop_utils).
    threshold_fact = -1
    threshold_consistency = -5
    threshold_entailment = 0.8

    # Sampling / generation settings.
    max_sample = 3000
    max_new_tokens = 128
    num_beams = 1
    temperature = 1.0
    top_k = 1
    top_p = 1

In [7]:
# Start from the class defaults and set this run's options on the instance.
# Options whose value equals the default are still listed so the complete
# run configuration is visible in one place.
args = Args()
vars(args).update(
    max_loop=3,
    max_knowledge_loop=3,
    max_response_loop=3,
    demo_num=0,
    threshold_entailment=0.8,
    threshold_fact=-1.0,
    threshold_consistency=-5,
    max_new_tokens=256,
)

# Single-example smoke test (disabled):
#final_knowledge, final_response, all_history_knowledge, all_history_response = main_loop(args, ds[0], model, tokenizer, knowledge_loop, response_loop, entailment_scorer, ctrleval_scorer)

In [8]:
#print(final_response)

In [9]:
# Run the self-reflection loop on the first `num_samples` test rows and collect
# one record per row. The resulting list is turned into a Dataset for scoring
# in the evaluation cell.
num_samples = 2
dataset = []
for sample in tqdm(ds.select(range(num_samples)), total=num_samples):
    # Read the fields before calling main_loop, which receives the same row.
    user_input = sample["question"]
    gold_answer = sample["response"]

    # main_loop returns a 4-tuple; only its second element is kept.
    _, final_response, _, _ = main_loop(
        args,
        sample,
        model,
        tokenizer,
        knowledge_loop,
        response_loop,
        entailment_scorer,
        ctrleval_scorer,
    )

    record = {
        "user_input": user_input,
        "response": final_response,
        "reference": gold_answer,
    }
    dataset.append(record)

  0%|          | 0/2 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['top_k', 'early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
`generation_config` default values have been modified to match model-specific defaults: {'do_sample': True, 'temperature': 0.7, 'top_p': 0.8, 'bos_token_id': 151643}. If this is not desired, please set these values explicitly.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


main_loop 0
knowledge_loop
process:0/1
text:  Based on Question, please generate the factual knowledge. To do this, please consider these factors: Verifiability, Objectivity, and Reliability of Source. Note that this evaluation should be based on the best available medical knowledge.

Question: Which university did one of the key figures in the American documentary film, released in 2015, directed by Malcolm Ingram, pay for, before being drafted 18th overall pick for the New Jersey Nets?
Knowledge:  

The question is asking about a person who was a key figure in the American documentary film released in 2015, directed by Malcolm Ingram. The film in question is "The 18th Man," which was released in 2015 and directed by Malcolm Ingram. The film focuses on the story of a man who was drafted 18th overall in the 1984 NBA Draft, and it explores his journey and the challenges he faced as a player. The key figure in the film is the man himself, who was drafted 18th overall in the 1984 NBA Draf

100%|██████████| 2/2 [00:00<00:00, 28.52it/s]


main_loop 1
knowledge_loop
process:0/1
text:  Based on Question, please generate the factual knowledge. To do this, please consider these factors: Verifiability, Objectivity, and Reliability of Source. Note that this evaluation should be based on the best available medical knowledge.

Question: Which university did one of the key figures in the American documentary film, released in 2015, directed by Malcolm Ingram, pay for, before being drafted 18th overall pick for the New Jersey Nets?
Knowledge:  

The question is asking about a person who was a key figure in the American documentary film released in 2015, directed by Malcolm Ingram. The film in question is "The 18th Man," which was released in 2015 and directed by Malcolm Ingram. The film focuses on the story of a man who was drafted 18th overall in the 1984 NBA Draft, and it explores his journey and the challenges he faced as a player. The key figure in the film is the man himself, who was drafted 18th overall in the 1984 NBA Draf

100%|██████████| 2/2 [00:00<00:00, 29.32it/s]


main_loop 2
knowledge_loop
process:0/1
text:  Based on Question, please generate the factual knowledge. To do this, please consider these factors: Verifiability, Objectivity, and Reliability of Source. Note that this evaluation should be based on the best available medical knowledge.

Question: Which university did one of the key figures in the American documentary film, released in 2015, directed by Malcolm Ingram, pay for, before being drafted 18th overall pick for the New Jersey Nets?
Knowledge:  

The question is asking about a person who was a key figure in the American documentary film released in 2015, directed by Malcolm Ingram. The film in question is "The 18th Man," which was released in 2015 and directed by Malcolm Ingram. The film focuses on the story of a man who was drafted 18th overall in the 1984 NBA Draft, and it explores his journey and the challenges he faced as a player. The key figure in the film is the man himself, who was drafted 18th overall in the 1984 NBA Draf

100%|██████████| 2/2 [00:00<00:00, 25.16it/s]
 50%|█████     | 1/2 [05:52<05:52, 352.85s/it]

main_loop 0
knowledge_loop
process:0/1
text:  Based on Question, please generate the factual knowledge. To do this, please consider these factors: Verifiability, Objectivity, and Reliability of Source. Note that this evaluation should be based on the best available medical knowledge.

Question: Were both Léopold Eyharts and Ulrich Walter a General in the French Air Force? 
Knowledge:  

As of 2023, the French Air Force has 13 general officers, including 10 full generals and 3 lieutenant generals. The rank of general in the French Air Force is equivalent to the rank of general in the French Army and the French Navy. The rank of general is a high-ranking position, typically held by experienced officers who have served in key leadership roles within the French Air Force. The rank of general is not a hereditary title and is not a title of nobility. It is a professional military rank, and it is awarded based on merit and experience. The rank of general is not a title of office, but rather a

100%|██████████| 2/2 [00:00<00:00, 27.62it/s]


main_loop 1
knowledge_loop
process:0/1
text:  Based on Question, please generate the factual knowledge. To do this, please consider these factors: Verifiability, Objectivity, and Reliability of Source. Note that this evaluation should be based on the best available medical knowledge.

Question: Were both Léopold Eyharts and Ulrich Walter a General in the French Air Force? 
Knowledge:  

As of 2023, the French Air Force has 13 general officers, including 10 full generals and 3 lieutenant generals. The rank of general in the French Air Force is equivalent to the rank of general in the French Army and the French Navy. The rank of general is a high-ranking position, typically held by experienced officers who have served in key leadership roles within the French Air Force. The rank of general is not a hereditary title and is not a title of nobility. It is a professional military rank, and it is awarded based on merit and experience. The rank of general is not a title of office, but rather a

100%|██████████| 2/2 [00:00<00:00, 28.53it/s]


main_loop 2
knowledge_loop
process:0/1
text:  Based on Question, please generate the factual knowledge. To do this, please consider these factors: Verifiability, Objectivity, and Reliability of Source. Note that this evaluation should be based on the best available medical knowledge.

Question: Were both Léopold Eyharts and Ulrich Walter a General in the French Air Force? 
Knowledge:  

As of 2023, the French Air Force has 13 general officers, including 10 full generals and 3 lieutenant generals. The rank of general in the French Air Force is equivalent to the rank of general in the French Army and the French Navy. The rank of general is a high-ranking position, typically held by experienced officers who have served in key leadership roles within the French Air Force. The rank of general is not a hereditary title and is not a title of nobility. It is a professional military rank, and it is awarded based on merit and experience. The rank of general is not a title of office, but rather a

100%|██████████| 2/2 [00:00<00:00, 29.24it/s]
100%|██████████| 2/2 [11:00<00:00, 330.09s/it]


In [10]:
# Gemini chat model wrapped for ragas and passed to `evaluate` as its `llm`:
# temperature 0, no explicit token or timeout limit, at most two retries.
gemini_judge = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)
evaluator_llm = LangchainLLMWrapper(gemini_judge)

E0000 00:00:1761039654.507590  832122 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
  evaluator_llm = LangchainLLMWrapper(


In [11]:
# Turn the collected records into a Dataset and score them with ragas'
# FactualCorrectness metric, using the wrapped Gemini model as the LLM.
evaluation_dataset = Dataset.from_list(dataset)

ragas_result = evaluate(
    evaluation_dataset,
    metrics=[FactualCorrectness()],
    llm=evaluator_llm,
)
print(ragas_result)

Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

E0000 00:00:1761039655.301941  832122 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


{'factual_correctness(mode=f1)': 0.0000}


In [12]:
# Per-sample scores as a DataFrame; display the first five rows.
ragas_result_df = ragas_result.to_pandas()
ragas_result_df.head(5)

Unnamed: 0,user_input,response,reference,factual_correctness(mode=f1)
0,Which university did one of the key figures in...,\n\nThe key figure in the American documentar...,One of the key figures in the American documen...,0.0
1,Were both Léopold Eyharts and Ulrich Walter a ...,\n\nStructure: \n1. Begin with a clear answer...,"No, only Léopold Eyharts was a General in the ...",0.0
