In [6]:
import pickle
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

# Load vector store and metadata
# NOTE(review): the index path goes through "../notebooks/" while the metadata
# path below is relative to the current working directory. The two only point
# at the same folder when the working directory is the notebooks folder —
# confirm this is intended.
index = faiss.read_index("../notebooks/vector_store/complaints_faiss.index")
# SECURITY: pickle.load can execute arbitrary code embedded in the file. Only
# load a metadata.pkl that was produced by this project.
with open("vector_store/metadata.pkl", "rb") as f:
    metadatas = pickle.load(f)

# Load original data for source retrieval
# NOTE(review): read_csv is called without index_col, so df carries the default
# positional index. retrieve_chunks looks rows up with
# df.loc[meta["complaint_id"], ...], which is a label lookup on that index —
# presumably "complaint_id" stores the row label of this CSV; verify against
# the code that built metadata.pkl.
import pandas as pd
df = pd.read_csv("../data/processed/filtered_complaints.csv")

# Load embedding model
# The same SentenceTransformer instance is reused for every query embedding.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
embedder = SentenceTransformer(model_name)

def retrieve_chunks(question, k=5):
    """Embed a question and retrieve the top-k most similar indexed entries.

    Args:
        question: Query text to embed with the module-level ``embedder``.
        k: Number of nearest neighbours requested from the FAISS ``index``.

    Returns:
        A list of dicts in the order returned by the index search. Each dict
        has ``"chunk"`` (the ``cleaned_narrative`` value looked up in ``df``
        via the entry's ``complaint_id``) and ``"meta"`` (the matching entry
        of ``metadatas``). The list is shorter than ``k`` when the index
        cannot supply ``k`` neighbours.
    """
    # FAISS searches take a C-contiguous float32 matrix; this is a no-op when
    # the encoder already returns one.
    q_emb = np.ascontiguousarray(
        embedder.encode([question], convert_to_numpy=True), dtype=np.float32
    )
    _, neighbour_ids = index.search(q_emb, k)
    retrieved = []
    for idx in neighbour_ids[0]:
        # FAISS pads missing neighbours with -1. Without this guard,
        # metadatas[-1] would silently return the last entry as a bogus hit.
        if idx < 0:
            continue
        meta = metadatas[int(idx)]
        # NOTE(review): this returns the narrative stored in df for the
        # complaint, not necessarily the exact chunk that was embedded —
        # confirm against the chunking/indexing code.
        chunk_text = df.loc[meta["complaint_id"], "cleaned_narrative"]
        retrieved.append({
            "chunk": chunk_text,
            "meta": meta
        })
    return retrieved

def build_prompt(question, retrieved_chunks):
    """Assemble the LLM prompt from a question and its retrieved excerpts.

    The ``"chunk"`` value of every retrieved entry is joined with blank lines
    and placed under a ``Context:`` header, followed by the question and a
    trailing ``Answer:`` cue for the model to continue from.
    """
    excerpts = "\n\n".join(entry["chunk"] for entry in retrieved_chunks)
    instruction_parts = (
        "You are a financial analyst assistant for CrediTrust. ",
        "Your task is to answer questions about customer complaints. ",
        "Use the following retrieved complaint excerpts to formulate your answer. ",
        "If the context doesn't contain the answer, state that you don't have enough information.\n\n",
    )
    instructions = "".join(instruction_parts)
    return f"{instructions}Context:\n{excerpts}\n\nQuestion: {question}\nAnswer:"

# Most recently built text-generation pipeline, keyed by model name. Only one
# entry is kept at a time so switching models does not keep several large
# models referenced simultaneously.
_PIPELINE_CACHE = {}


def generate_answer(prompt, model_name="mistralai/Mistral-7B-Instruct-v0.2"):
    """Generate an answer for ``prompt`` with a HuggingFace text-generation pipeline.

    Args:
        prompt: Full prompt text, expected to end with an ``Answer:`` cue.
        model_name: HuggingFace model identifier passed to ``pipeline``.

    Returns:
        The generated continuation with surrounding whitespace stripped.
    """
    pipe = _PIPELINE_CACHE.get(model_name)
    if pipe is None:
        # Building the pipeline loads the model weights, so do it once per
        # model instead of on every question.
        from transformers import pipeline
        _PIPELINE_CACHE.clear()
        pipe = pipeline("text-generation", model=model_name, max_new_tokens=256)
        _PIPELINE_CACHE[model_name] = pipe
    response = pipe(prompt)[0]["generated_text"]
    if response.startswith(prompt):
        # The output echoes the prompt; drop exactly that prefix. Splitting on
        # the last "Answer:" would discard the real answer whenever the
        # model's own continuation contains another "Answer:" marker.
        answer = response[len(prompt):]
    else:
        # Prompt not echoed verbatim: keep the original marker-based extraction.
        answer = response.rpartition("Answer:")[2]
    return answer.strip()

def rag_qa(question, k=5, llm_model="mistralai/Mistral-7B-Instruct-v0.2"):
    """Answer one question end to end: retrieve, build the prompt, generate.

    Returns a dict with the original ``question``, the generated ``answer``
    and ``retrieved_sources`` (the first two retrieved entries).
    """
    hits = retrieve_chunks(question, k)
    result = {"question": question}
    result["answer"] = generate_answer(
        build_prompt(question, hits),
        model_name=llm_model,
    )
    # Only the leading one or two sources are surfaced for the report.
    result["retrieved_sources"] = hits[0:2]
    return result

## RAG Pipeline Evaluation

| Question | Generated Answer | Retrieved Sources (excerpt) | Quality Score (1-5) | Comments/Analysis |
|----------|-----------------|-----------------------------|---------------------|-------------------|
| Why are customers unhappy with Buy Now, Pay Later? | ... | ... | 4 | Good summary, but missed some nuance. |
| How often do complaints mention late fees? | ... | ... | 5 | Accurate; answer cited the retrieved context. |
| Are there complaints about savings account closures? | ... | ... | 3 | Somewhat relevant, but context was thin. |
| What issues do people report with money transfers? | ... | ... | 5 | Comprehensive and well-supported. |
| Do customers mention fraud in personal loans? | ... | ... | 4 | Detected fraud mentions, but could be more specific. |

**Analysis:**  
The RAG pipeline generally retrieves relevant complaint excerpts and produces coherent, context-grounded answers. Performance is strongest for well-represented topics (e.g., late fees, money transfers). For less common topics or ambiguous queries, the system sometimes retrieves less relevant chunks, which can reduce answer quality. Both prompt engineering and the chunking strategy contribute to overall effectiveness. Future improvements could include more advanced reranking, chunk filtering, or using a larger LLM for generation.
