Load RAG model from the path ../models/rag_moodel

In [82]:
# --------------- RAG SETUP ------------------
import json, pickle, time, re
import numpy as np
import pandas as pd
import faiss
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM
from rank_bm25 import BM25Okapi
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
import torch

# ---- Load chunk metadata & indexes (from models/rag_model/) ----
rag_dir = "../models/rag_model"

# Load chunks (from chunks.json)
with open(f"{rag_dir}/chunks.json","r") as f:
    chunks_list = json.load(f)   # list[str]

# Load FAISS index   
faiss_index = faiss.read_index(f"{rag_dir}/faiss_index.bin")

# Load dense embedder
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Build BM25 model (using distilgpt2 tokenizer)
tokenizer = AutoTokenizer.from_pretrained("distilgpt2", local_files_only=True)
tokenized_chunks = [tokenizer.tokenize(c.lower()) for c in chunks_list]
normalized_chunks = [[t.lstrip("Ġ") for t in toks] for toks in tokenized_chunks]
bm25_model = BM25Okapi(normalized_chunks)

# Hybrid retrieval functions (copied from RAG notebook)
def preprocess_query(query):
    sw = set(stopwords.words('english'))
    query = re.sub(r'[^a-zA-Z0-9\s]','', query.lower())
    toks = tokenizer.tokenize(query)
    toks = [t.lstrip("Ġ") for t in toks]
    return ' '.join([t for t in toks if t not in sw and len(t) > 2])

def dense_retrieval(query, model, faiss_index, chunks, top_k=5):
    emb = model.encode([query]).astype(np.float32)
    dists, idxs = faiss_index.search(emb, top_k)
    return [{"chunk_id":int(idx), "text":chunks[idx], "score":1/(1+dist)} 
            for dist,idx in zip(dists[0],idxs[0])]

def sparse_retrieval(query, bm25, chunks, top_k=5):
    toks = [t.lstrip("Ġ") for t in tokenizer.tokenize(query.lower())]
    scores = bm25.get_scores(toks)
    top = np.argsort(scores)[::-1][:top_k]
    return [{"chunk_id":int(i), "text":chunks[i], "score":float(scores[i])} for i in top]

def combine_results(dense_r, sparse_r, alpha=0.7):
    combined={}
    if dense_r:
        m=max([r['score'] for r in dense_r]);      [r.update(score=r['score']/m) for r in dense_r]
    if sparse_r:
        m=max([r['score'] for r in sparse_r]);     [r.update(score=r['score']/m) for r in sparse_r]
    for r in dense_r:
        combined[r['chunk_id']]={'text':r['text'],'score':alpha*r['score']}
    for r in sparse_r:
        if r['chunk_id'] in combined:
            combined[r['chunk_id']]['score']+= (1-alpha)*r['score']
        else:
            combined[r['chunk_id']]={'text':r['text'],'score':(1-alpha)*r['score']}
    return sorted(combined.values(), key=lambda x:x['score'], reverse=True)

def hybrid_retrieve(query, top_k=5):
    q = preprocess_query(query)
    d = dense_retrieval(q, embedder, faiss_index, chunks_list, top_k)
    s = sparse_retrieval(q, bm25_model, chunks_list, top_k)
    return combine_results(d, s, alpha=0.7)[:top_k]

print("✅ RAG objects & retrieval functions loaded.")


✅ RAG objects & retrieval functions loaded.


Load the fine-tuned model from the path ../models/fine_tuned_model.

In [83]:
# --------------- LOAD FINE-TUNED MODEL ------------------
ft_model_path = "../models/fine_tuned_model"

ft_tokenizer = AutoTokenizer.from_pretrained(ft_model_path, local_files_only=True)
ft_model     = AutoModelForCausalLM.from_pretrained(ft_model_path, local_files_only=True)

# For RAG we use distilgpt2 as the generator
rag_tokenizer = AutoTokenizer.from_pretrained("distilgpt2", local_files_only=True)
rag_model     = AutoModelForCausalLM.from_pretrained("distilgpt2", local_files_only=True)

print("✅ Fine-tuned and generator models loaded.")


✅ Fine-tuned and generator models loaded.


In [84]:
# Set pad_token to eos_token to avoid padding errors
rag_tokenizer.pad_token = rag_tokenizer.eos_token
ft_tokenizer.pad_token = ft_tokenizer.eos_token

Test Questions

In [85]:
test_questions = [
    # 3 official
    {"question": "What was MR. COOPER GROUP INC.'s revenue in 2023?",
     "ground_truth":   "4781000000.0"},
    {"question": "What was PITNEY BOWES INC's revenue in 2024?",
     "ground_truth":   "3100000000.0"},
    {"question": "What is the capital of France?",
     "ground_truth":  "Data not in scope"},

    # 10 more
    {"question":"What was PRUDENTIAL FINANCIAL INC's revenue in 2023?",
     "ground_truth":"53979000000.0"},
    {"question":"What was KAYNE ANDERSON BDC, INC.'s total liabilities in 2023?",
     "ground_truth":"740610000.0"},
    {"question":"What was CUMBERLAND PHARMACEUTICALS INC's total assets in 2022?",
     "ground_truth":"92925158.0"},
    {"question":"What was STERIS PLC's revenue in 2023?",
     "ground_truth":"241114000.0"},
    {"question":"What was FLUOR CORP's net income in 2022?",
     "ground_truth":"-35332865.0"},
    {"question":"What was AXON ENTERPRISE INC's total liabilities in 2023?",
     "ground_truth":"959000000.0"},
    {"question":"What was ZENTALIS PHARMACEUTICALS INC's net income in 2023?",
     "ground_truth":"571000000.0"},
    {"question":"What was BANK OF AMERICA CORP's revenue in 2023?",
     "ground_truth":"4621000000.0"},
    {"question":"What was VULCAN MATERIALS CO's revenue in 2023?",
     "ground_truth":"18589000000.0"},
    {"question":"What was UNITEDHEALTH GROUP INC's revenue in 2024?",
     "ground_truth":"28560000000.0"}
]


Function to get answer from finetuned model

In [86]:
import torch.nn.functional as F

def answer_with_finetuned(q):
    inputs = ft_tokenizer(q, return_tensors="pt", padding=True, truncation=True)

    with torch.no_grad():
        outputs = ft_model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=50,
            pad_token_id=ft_tokenizer.eos_token_id,
            return_dict_in_generate=True,
            output_scores=True,
            do_sample=False
        )

    generated_tokens = outputs.sequences[0][inputs["input_ids"].shape[-1]:]

    probs = [F.softmax(score[0], dim=-1) for score in outputs.scores]

    token_confidences = []
    for prob, token in zip(probs, generated_tokens):
        token_id = token.item()
        if token_id < prob.shape[0]:  # prevent out-of-bounds error
            token_confidences.append(prob[token_id].item())
        else:
            token_confidences.append(0.0)  # fallback in case of mismatch

    avg_confidence = sum(token_confidences) / len(token_confidences) if token_confidences else 0.0
    generated_text = ft_tokenizer.decode(generated_tokens, skip_special_tokens=True)

    return generated_text.strip(), avg_confidence


Function to get answer from RAG model

In [88]:
def answer_with_rag(q, top_k=5):
    # Retrieve top-k chunks
    top_chunks = hybrid_retrieve(q, top_k=top_k)
    
    # Compute "confidence" as the average of the normalized retrieval scores
    if len(top_chunks) > 0:
        confidence = float(np.mean([c["score"] for c in top_chunks]))
    else:
        confidence = 0.0
    
    # Build context and generate final answer
    context = " ".join([c["text"] for c in top_chunks])
    prompt  = context + "\n\nQuestion: " + q + "\nAnswer:"
    # inp     = rag_tokenizer.encode(prompt, return_tensors="pt")
    inp = rag_tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
    out = rag_model.generate(
    input_ids=inp["input_ids"],
    attention_mask=inp["attention_mask"],
    max_new_tokens=50,
    pad_token_id=rag_tokenizer.eos_token_id
    )
    # out     = rag_model.generate(inp, max_new_tokens=50)
    answer  = rag_tokenizer.decode(out[0], skip_special_tokens=True)
    
    return answer, confidence

Guardrail

In [89]:

def input_guardrail(question: str) -> bool:
    keywords = ["revenue", "income", "assets", "liabilities", "cash"]
    return any(kw in question.lower() for kw in keywords)


Results

In [90]:


results = []

for item in test_questions:
    q  = item["question"]
    gt = item["ground_truth"]

    # -----------------------------------------------
    #         GUARDRAIL → check question
    # -----------------------------------------------
    if not input_guardrail(q):
        ft_ans  = "[Guardrail] Out-of-scope question"
        rag_ans = "[Guardrail] Out-of-scope question"
        ft_conf = "N/A"
        rag_conf = 0.0
        t_ft, t_rag = 0.0, 0.0
    else:
        # fine-tuned model
        # start = time.time()
        # ft_ans = answer_with_finetuned(q)
        # t_ft = time.time() - start
        # ft_conf = "N/A"
        start = time.time()
        ft_ans, ft_conf = answer_with_finetuned(q)
        t_ft = time.time() - start

        # RAG model
        start = time.time()
        rag_ans, rag_conf = answer_with_rag(q)
        t_rag = time.time() - start

    results.append({"question":q,"method":"Fine-Tuned","answer":ft_ans,"ground_truth":gt,"confidence": ft_conf,"time(s)":round(t_ft,2)})
    results.append({"question":q,"method":"RAG","answer":rag_ans,"ground_truth":gt,"confidence": rag_conf,"time(s)":round(t_rag,2)})

df_results = pd.DataFrame(results)


Save the results in csv and xlxs file.

In [92]:
# Save
df_results.to_excel("../results/evaluation_results.xlsx", index=False)
print("✅ Results saved to ../results/evaluation_results.xlsx")
df_results.to_csv("../results/evaluation_results.csv", index=False)
print("✅ Results saved to ../results/evaluation_results.csv")

✅ Results saved to ../results/evaluation_results.xlsx
✅ Results saved to ../results/evaluation_results.csv


In [93]:

df_results.head()

Unnamed: 0,question,method,answer,ground_truth,confidence,time(s)
0,What was MR. COOPER GROUP INC.'s revenue in 2023?,Fine-Tuned,MR. COOPER GROUP INC. reported revenue of -0.0...,4781000000.0,0.913409,2.53
1,What was MR. COOPER GROUP INC.'s revenue in 2023?,RAG,LAMAR ADVERTISING CO/NEW reported net income o...,4781000000.0,0.69755,8.57
2,What was PITNEY BOWES INC's revenue in 2024?,Fine-Tuned,PITNEY BOWES INC reported revenue of -0.0 in 2...,3100000000.0,0.897528,2.16
3,What was PITNEY BOWES INC's revenue in 2024?,RAG,SEZZLE INC. reported revenue of 109739057.0 in...,3100000000.0,0.694203,7.82
4,What is the capital of France?,Fine-Tuned,[Guardrail] Out-of-scope question,Data not in scope,,0.0
