Import all libraries

In [None]:
import os
import json
import pandas as pd
import numpy as np
from tqdm import tqdm
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer
import faiss
import pickle
import re
from nltk.corpus import stopwords
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import torch

In [None]:
def extract_numeric_answer(query, retrieved_chunks):
    """
    Attempt to extract a numeric answer from retrieved sentences
    by matching relevant financial keywords in the query.
    """
    # Define common financial metric keywords
    keywords = {
        'revenue': ['revenue', 'revenues', 'sales'],
        'net income': ['net income', 'income', 'net earnings', 'profit'],
        'assets': ['total assets', 'assets'],
        'liabilities': ['total liabilities', 'liabilities'],
        'cash': ['cash and cash equivalents', 'cash']
    }

    # Normalize query
    query_lower = query.lower()

    # Find which category the query is targeting
    target_category = None
    for key, terms in keywords.items():
        if any(term in query_lower for term in terms):
            target_category = key
            break

    if not target_category:
        return None  # No match found

    # Search for numbers in relevant retrieved chunks
    for chunk in retrieved_chunks:
        text = chunk['text']
        if any(term in text.lower() for term in keywords[target_category]):
            # Find number in sentence
            match = re.search(r'([\-]?\d[\d,]*\.?\d*)', text)
            if match:
                num_str = match.group(1).replace(',', '')
                try:
                    value = float(num_str)
                    return value
                except:
                    continue

    return None  # No numeric value confidently extracted


Load the tokenizer and the generative model

In [4]:
# Load tokenizer and generative model
device = 0 if torch.cuda.is_available() else -1
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")
generator = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    device=device
)

Device set to use cpu


Load the dataset 

In [5]:
# Load the data
data_path = "../data/processed/financial_sentences_10k.xlsx"
df = pd.read_excel(data_path)
text_col = 'sentence'
sentences = df[text_col].dropna().astype(str).tolist()

Chunking

In [6]:
# ---- Sentence-level Chunking ----
def sentence_chunking(sentence_list):
    return [sent.strip() for sent in sentence_list if len(sent.strip()) > 0]

chunks = sentence_chunking(sentences)

Loading embedding model and the cross-encoder

In [9]:
# ---- Build BM25 + FAISS on sentence-level chunks ----
print("Loading embedding model (E5-small-v2) and the cross encoder model...")
embedding_model = SentenceTransformer('intfloat/e5-small-v2')

# Load cross-encoder for multi-stage reranking
from sentence_transformers import CrossEncoder
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

Loading embedding model (E5-small-v2) and the cross encoder model...


Encoding the chunks 

In [11]:
# BM25 index
tokenized_chunks = [tokenizer.tokenize(chunk.lower()) for chunk in chunks]
bm25 = BM25Okapi(tokenized_chunks)
# FAISS index
print("Encoding sentence chunks...")
embeddings = embedding_model.encode(chunks, show_progress_bar=True)
embeddings = np.array(embeddings, dtype=np.float32)
dimension = embeddings.shape[1]
faiss_index = faiss.IndexFlatL2(dimension)
faiss_index.add(embeddings)


Encoding sentence chunks...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Saving the model

In [12]:
# Save indices and chunks for Streamlit usage
output_dir = "../models/rag_model"
os.makedirs(output_dir, exist_ok=True)
faiss.write_index(faiss_index, f"{output_dir}/faiss_index.bin")
with open(f"{output_dir}/bm25_index.pkl", 'wb') as f: pickle.dump(bm25, f)
with open(f"{output_dir}/chunks.json", 'w') as f: json.dump(chunks, f)
np.save(f"{output_dir}/embeddings.npy", embeddings)

Preprocess the query

In [21]:
# ---- Preprocessing ----
def preprocess_query(query: str) -> str:
    stop_words = set(stopwords.words('english'))
    q = re.sub(r'[^a-zA-Z0-9\s]', '', query.lower())
    tokens = tokenizer.tokenize(q)
    tokens = [t.lstrip('Ġ') for t in tokens]
    filtered = [t for t in tokens if t not in stop_words and len(t) > 2]
    return ' '.join(filtered)

Retrieve the chunk based on semantic similarity

In [22]:
# ---- Retrieval ----
def dense_retrieval(query, top_k):
    q_emb = embedding_model.encode([query])
    q_emb = np.array(q_emb, dtype=np.float32)
    dists, ids = faiss_index.search(q_emb, top_k)
    results = []
    for dist, idx in zip(dists[0], ids[0]):
        if idx != -1:
            results.append({'chunk_id': int(idx),'text': chunks[idx],'score': float(1/(1+dist)),'method':'dense'})
    return results

Retrieve the chunk based on cosine similarity

In [23]:
def sparse_retrieval(query, top_k):
    tokens = preprocess_query(query).split()
    scores = bm25.get_scores(tokens)
    top_idxs = np.argsort(scores)[::-1][:top_k]
    results = []
    for i in top_idxs:
        if scores[i] > 0:
            results.append({'chunk_id':int(i),'text': chunks[i],'score':float(scores[i]),'method':'sparse'})
    return results

In [24]:

def combine_results(dense, sparse, alpha=0.7):
    combined = {}
    if dense:
        maxd = max(r['score'] for r in dense)
        for r in dense: r['score'] /= maxd
    if sparse:
        maxs = max(r['score'] for r in sparse)
        for r in sparse: r['score'] /= maxs
    for r in dense: combined[r['chunk_id']]={'chunk_id':r['chunk_id'],'text':r['text'],'score':alpha*r['score'],'method':'dense'}
    for r in sparse:
        if r['chunk_id'] in combined:
            combined[r['chunk_id']]['score']+= (1-alpha)*r['score']
            combined[r['chunk_id']]['method']='hybrid'
        else:
            combined[r['chunk_id']]={'chunk_id':r['chunk_id'],'text':r['text'],'score':(1-alpha)*r['score'],'method':'sparse'}
    return sorted(combined.values(), key=lambda x:x['score'], reverse=True)


Guardrails

In [25]:
# ---- Guardrails ----
class RAGGuardrails:
    def __init__(self):
        self.harmful_patterns=[r'\b(hack|steal|illegal|fraud|scam)\b']
        self.min_query_length=3
    def validate_input(self,q):
        if len(q.strip())<3: return False, 'Query too short'
        return True,'ok'
    def validate_output(self,rtext):
        return True,'ok'

guardrails = RAGGuardrails()

RAG pipeline

In [26]:

# ---- Main Pipeline with metadata filtering ----
def rag_pipeline(query, top_k=5):
    """Main RAG pipeline with multi-stage retrieval"""
    ok,msg = guardrails.validate_input(query)
    if not ok:
        return {'query':query,'answer':msg,'retrieved_chunks':[], 'confidence':0.0,'method':'validation'}

    q_lower = query.lower()
    company_candidates = [name for name in df['name'].unique() if name.lower() in q_lower]
    year_candidates = re.findall(r'20\d{2}', query)
    filtered_df = df.copy()
    if company_candidates:
        filtered_df = filtered_df[filtered_df['name'].str.lower().isin([c.lower() for c in company_candidates])]
    if year_candidates:
        yr = year_candidates[0]
        filtered_df = filtered_df[ filtered_df['ddate'].astype(str).str.startswith(yr) ]
    candidate_sentences = filtered_df[text_col].dropna().astype(str).tolist()
    search_chunks = sentence_chunking(candidate_sentences) if candidate_sentences else chunks

    processed = preprocess_query(query)
    dense = dense_retrieval(processed, top_k)
    sparse = sparse_retrieval(processed, top_k)
    # Stage1 hybrid retrieval
    stage1 = combine_results(dense, sparse)[:20]
    # Stage2 reranking using cross-encoder
    texts_for_rerank = [r['text'] for r in stage1]
    cross_scores = cross_encoder.predict([(query, t) for t in texts_for_rerank])
    for i,r in enumerate(stage1): r['final_score'] = 0.5*r['score'] + 0.5*float(cross_scores[i])
    retrieved = sorted(stage1, key=lambda x: x['final_score'], reverse=True)[:top_k]


    context = " ".join([c['text'] for c in retrieved])
    prompt = f"Answer the question based on the context below.\n\nContext: {context}\n\nQuestion: {query}\nAnswer:"

    numeric_answer = extract_numeric_answer(query, retrieved)

    if numeric_answer is not None:
        answer = f"{numeric_answer: ,.0f}"
    else:
        # fallback LM answer
        if context.strip():  # only call generator if there is context
            resp = generator(prompt, max_new_tokens=80, do_sample=False)
            raw_text = resp[0]['generated_text']
            answer = raw_text.split('Answer:')[-1].split('\n')[0].strip()
        else:
            answer = "No relevant information found in the data."

    return {
        'query': query,
        'answer': answer,
        'retrieved_chunks': retrieved,
        'confidence': np.mean([r['score'] for r in retrieved]) if retrieved else 0.0,
        'method': 'rag_numeric'
    }



In [28]:
# --------------------------
# Quick sanity check
# --------------------------
if __name__ == "__main__":
    sample_queries = [
        "What was MR. COOPER GROUP INC's revenue in 2023?",
        "How much net income did PITNEY BOWES INC report in 2022?",
        "What were the total assets of CMS ENERGY CORP in 2022?",
        "What is the capital of France?"
    ]

    for q in sample_queries:
        result = rag_pipeline(q, top_k=5)
        print("\nQuery:", q)
        print("Answer:", result['answer'])
        print("Confidence:", round(result['confidence'],3))



Query: What was MR. COOPER GROUP INC's revenue in 2023?
Answer:  2,464,000,000
Confidence: 0.277

Query: How much net income did PITNEY BOWES INC report in 2022?
Answer: -6000.0
Confidence: 0.28

Query: What were the total assets of CMS ENERGY CORP in 2022?
Answer:  1,102,000,000
Confidence: 0.279

Query: What is the capital of France?
Answer: ALTISOURCE PORTFOLIO SOLUTIONS S.A.
Confidence: 0.697
