## RAG Core Logic and Evaluation


### Load FAISS index

In [3]:
import faiss

# Load the FAISS index previously saved to disk; the path is relative to the
# notebook's working directory. Later cells query it through `index.search`.
index = faiss.read_index("../Data/processed/faiss_index/index.faiss")


In [4]:
# Sanity check: print the number of entries in the loaded index (`ntotal`).
print(f"FAISS index size: {index.ntotal}")

FAISS index size: 40752


### Load sampled metadata only

In [5]:
import pandas as pd

# Complaint records; retrieve_chunks looks rows up in this frame to attach
# complaint fields (narrative, product, company, ...) to each FAISS hit.
df = pd.read_csv("../Data/processed/sampled_complaints.csv")


In [6]:
# Quick look at the loaded frame: dimensions, column names, then the first rows.
print("df shape:", df.shape)
print(f"Columns: {df.columns.tolist()}")
print(df.head(5))

df shape: (12000, 20)
Columns: ['Date received', 'Product', 'Sub-product', 'Issue', 'Sub-issue', 'Consumer complaint narrative', 'Company public response', 'Company', 'State', 'ZIP code', 'Tags', 'Consumer consent provided?', 'Submitted via', 'Date sent to company', 'Company response to consumer', 'Timely response?', 'Consumer disputed?', 'Complaint ID', 'narrative_word_count', 'cleaned_narrative']
  Date received      Product                                 Sub-product  \
0    2024-12-22  Credit card                           Store credit card   
1    2016-11-11  Credit card                                         NaN   
2    2016-12-23  Credit card                                         NaN   
3    2024-08-30  Credit card  General-purpose credit card or charge card   
4    2024-07-13  Credit card  General-purpose credit card or charge card   

                                             Issue  \
0                            Getting a credit card   
1                             APR

### Embed the user question

In [7]:
from transformers import AutoTokenizer

# Tokenizer for the same checkpoint the generator uses ("google/flan-t5-base").
# from_pretrained may contact huggingface.co when the files are not available
# locally, so this cell can be slow or emit retry warnings while offline.
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")

def truncate_to_tokens(text, max_tokens=400):
    """Return `text` shortened to at most `max_tokens` tokenizer tokens.

    The text is encoded with the module-level `tokenizer`, truncating at
    `max_tokens`, and the resulting ids are decoded back to a string with
    special tokens stripped.
    """
    return tokenizer.decode(
        tokenizer.encode(text, max_length=max_tokens, truncation=True),
        skip_special_tokens=True,
    )



  from .autonotebook import tqdm as notebook_tqdm
'(MaxRetryError('HTTPSConnectionPool(host=\'huggingface.co\', port=443): Max retries exceeded with url: /google/flan-t5-base/resolve/main/tokenizer_config.json (Caused by NameResolutionError("HTTPSConnection(host=\'huggingface.co\', port=443): Failed to resolve \'huggingface.co\' ([Errno 11001] getaddrinfo failed)"))'), '(Request ID: 07e7d672-f575-44d8-8bb0-59699f7904c1)')' thrown while requesting HEAD https://huggingface.co/google/flan-t5-base/resolve/main/tokenizer_config.json
Retrying in 1s [Retry 1/5].
'(MaxRetryError('HTTPSConnectionPool(host=\'huggingface.co\', port=443): Max retries exceeded with url: /google/flan-t5-base/resolve/main/tokenizer_config.json (Caused by NameResolutionError("HTTPSConnection(host=\'huggingface.co\', port=443): Failed to resolve \'huggingface.co\' ([Errno 11001] getaddrinfo failed)"))'), '(Request ID: 5124389d-a369-47a2-92b6-8102b9a92557)')' thrown while requesting HEAD https://huggingface.co/google/fl

In [None]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model that embed_question uses to vectorize questions.
# NOTE(review): presumably the same model that produced the vectors stored in
# the FAISS index - confirm against the index-building code.
# NOTE(review): this cell is recorded without an execution count (In [None]) and
# the evaluation cell's saved output is "NameError: name 'embed_question' is not
# defined"; run this cell before the retrieval / evaluation cells.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

def embed_question(question: str):
    """Encode one question with the module-level `embedder`.

    The question is passed to `embedder.encode` as a one-item batch with
    `normalize_embeddings=True`; the encoder's result is returned unchanged.

    Raises:
        ValueError: if `question` is empty or contains only whitespace.
    """
    stripped = question.strip() if question else ""
    if not stripped:
        raise ValueError("Question cannot be empty")

    batch = [question]
    return embedder.encode(batch, normalize_embeddings=True)

### Retrieve top-k chunks from FAISS

In [8]:
# Default location of the metadata that accompanies the FAISS index.
_INDEX_META_PATH = "../Data/processed/faiss_index/index_meta.json"

# Parsed metadata keyed by file path, so the JSON is read once per kernel
# session instead of on every query. Re-running this cell resets the cache.
_INDEX_META_CACHE = {}


def _load_index_meta(path: str):
    """Return the parsed index metadata for `path`, reading the file only once."""
    import json

    if path not in _INDEX_META_CACHE:
        with open(path, encoding="utf-8") as f:
            _INDEX_META_CACHE[path] = json.load(f)
    return _INDEX_META_CACHE[path]


def retrieve_chunks(question: str, k: int = 5, meta_path: str = _INDEX_META_PATH):
    """Return the complaint rows behind the top-`k` FAISS hits for `question`.

    Args:
        question: Natural-language query; embedded with `embed_question`.
        k: Number of neighbours requested from `index.search`.
        meta_path: JSON file mapping each index position to an entry that
            carries a ``complaint_id`` key. Defaults to the notebook's
            original location.

    Returns:
        A DataFrame with one row per valid hit, holding the columns of `df`
        plus ``similarity_score`` (the raw value returned by `index.search`).
        When there are no valid hits the frame is empty but still has those
        columns, so callers can select columns without a KeyError.

    Raises:
        ValueError: propagated from `embed_question` for an empty question.
    """
    query_vec = embed_question(question)
    distances, indices = index.search(query_vec, k)
    meta = _load_index_meta(meta_path)

    rows = []
    for dist, idx in zip(distances[0], indices[0]):
        # FAISS pads the result with label -1 when fewer than k neighbours
        # exist; indexing meta[-1] would silently return the LAST entry.
        if idx < 0:
            continue
        complaint_id = meta[int(idx)]["complaint_id"]
        # NOTE(review): `complaint_id` is used as a positional row number into
        # `df`. If the metadata actually stores "Complaint ID" values, this
        # must become a lookup on that column - confirm against the code that
        # built the index.
        row = df.iloc[complaint_id].copy()
        # NOTE(review): whether this value is a similarity or a distance
        # depends on the metric the index was built with - confirm.
        row["similarity_score"] = dist
        rows.append(row)

    if not rows:
        return pd.DataFrame(columns=[*df.columns, "similarity_score"])
    return pd.DataFrame(rows)

### Generator 

In [9]:
from transformers import pipeline

def load_llm(model_name="google/flan-t5-base", max_new_tokens=200):
    """Build the text2text-generation pipeline used as the answer generator.

    Args:
        model_name: Value forwarded as the pipeline's ``model`` argument.
            Defaults to the checkpoint the notebook originally hard-coded.
        max_new_tokens: Value forwarded as the pipeline's ``max_new_tokens``
            argument. Defaults to the original 200.

    Returns:
        The object returned by `pipeline(...)`.
    """
    return pipeline(
        "text2text-generation",
        model=model_name,
        max_new_tokens=max_new_tokens,
    )



In [10]:
import sys
import os

# Make the parent of the working directory importable so `src.*` resolves.
# The membership check keeps sys.path free of duplicates when this cell is
# re-run in the same kernel.
project_root = os.path.abspath("..")
if project_root not in sys.path:
    sys.path.append(project_root)

from src.RAG.prompt import PROMPT_TEMPLATE

# Build the generator once; run_rag reuses this module-level pipeline.
llm = load_llm()

Device set to use cpu


### RAG pipeline

In [14]:
def run_rag(question, k=5, max_context_tokens=400):
    """Answer `question` from retrieved complaint narratives.

    Steps: retrieve the top-`k` rows with `retrieve_chunks`, build a context
    from the first three narratives, fill `PROMPT_TEMPLATE`, and run the
    module-level `llm` on the prompt.

    Args:
        question: The user question; also inserted into the prompt.
        k: Number of rows requested from `retrieve_chunks`.
        max_context_tokens: Total token budget for the context, split evenly
            across the narratives that are used. Defaults to 400, the default
            of `truncate_to_tokens`.

    Returns:
        dict with
          - ``"answer"``: the ``generated_text`` of the first `llm` result;
          - ``"sources"``: the ``Product``/``Issue``/``Company`` columns of the
            first two retrieved rows.
    """
    retrieved = retrieve_chunks(question, k)

    # Drop missing narratives: str.join raises TypeError on a NaN float.
    narratives = (
        retrieved["Consumer complaint narrative"]
        .head(3)
        .dropna()
        .astype(str)
        .tolist()
    )

    if narratives:
        # Truncate each narrative separately (rather than the joined text) so
        # every retrieved complaint keeps a share of the budget and the blank
        # line separators between them are preserved.
        per_chunk_tokens = max(1, max_context_tokens // len(narratives))
        narratives = [
            truncate_to_tokens(text, per_chunk_tokens) for text in narratives
        ]

    context = "\n\n".join(narratives)

    prompt = PROMPT_TEMPLATE.format(
        context=context,
        question=question
    )

    response = llm(prompt)[0]["generated_text"]

    return {
        "answer": response,
        # NOTE(review): the context uses up to three rows but only two are
        # reported as sources - confirm this difference is intentional.
        "sources": retrieved[
            ["Product", "Issue", "Company"]
        ].head(2)
    }

### Evaluation 

In [13]:
# Questions used for the qualitative evaluation of the pipeline.
questions = [
    "Why are customers unhappy with credit cards?",
    "What issues are common in money transfers?",
    "Are savings accounts causing complaints?",
]

# One record per question. "Quality Score" and "Comments" start out empty
# (presumably filled in by hand after reading the generated answers).
results = []
for question_text in questions:
    rag_output = run_rag(question_text)
    record = {
        "Question": question_text,
        "Generated Answer": rag_output["answer"],
        "Retrieved Sources": str(rag_output["sources"].to_dict()),
        "Quality Score": "",
        "Comments": "",
    }
    results.append(record)

# Last expression of the cell, so the notebook renders the table.
pd.DataFrame(results)


NameError: name 'embed_question' is not defined