In [None]:
!pip -q install chromadb sentence-transformers pypdf langchain-text-splitters openai tiktoken tavily-python gradio

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.8/19.8 MB[0m [31m87.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.5/310.5 kB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m284.2/284.2 kB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m54.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m103.3/103.3 kB[0m [31m8.0 MB/s[0m eta [36m0:0

In [None]:

import os

# ⬇️ Provide your keys through the environment before running this cell
# (e.g. Colab secrets, `%env OPENAI_API_KEY=...`, or your shell profile).
# Keys are deliberately not written into the notebook. Assigning a missing
# variable back into os.environ raises TypeError (environ values must be str),
# so missing keys are reported here instead of being re-assigned.
for _key_name in ("OPENAI_API_KEY", "TAVILY_API_KEY"):
    if not os.environ.get(_key_name):
        print(f"WARNING: {_key_name} is not set; features that need it will be unavailable.")

# Choose an OpenAI model (good defaults: gpt-4o-mini, gpt-4o)
os.environ["OPENAI_MODEL"] = os.environ.get("OPENAI_MODEL") or "gpt-4o-mini"

# Report configuration without echoing any secret; template placeholders count as "not set".
print("OPENAI_MODEL:", os.environ["OPENAI_MODEL"])
print("OPENAI_API_KEY set?", bool(os.environ.get("OPENAI_API_KEY") and "YOUR_OPENAI" not in os.environ["OPENAI_API_KEY"]))
print("TAVILY_API_KEY set?", bool(os.environ.get("TAVILY_API_KEY") and "YOUR_TAVILY" not in os.environ["TAVILY_API_KEY"]))

OPENAI_MODEL: gpt-4o-mini
OPENAI_API_KEY set? True
TAVILY_API_KEY set? True


In [None]:
import os, pathlib

# Working folders, relative to the notebook's current directory.
DATA_DIR = "data"  # source PDFs are downloaded/uploaded here
DB_DIR = "db"      # passed to chromadb.PersistentClient as its storage path

# exist_ok=True makes this a no-op for folders that are already present.
os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(DB_DIR, exist_ok=True)

print("Folders ready:", DATA_DIR, DB_DIR)

Folders ready: data db


In [None]:
import subprocess, sys, os

# (file name to save as, source URL) pairs for the sample corpus.
samples = [
  # OCC Comptroller’s Handbook (Credit Card Lending) – policy background
  ("OCC_Credit_Card_Lending_Handbook.pdf",
   "https://www.occ.treas.gov/publications-and-resources/publications/comptrollers-handbook/files/credit-card-lending/pub-ch-credit-card.pdf"),
  # Chase cardmember agreement example (rates & fees)
  ("Chase_Cardmember_Rates_Fees.pdf",
   "https://www.chase.com/content/feed/public/creditcards/cma/Chase/COL00094.pdf"),
  # Capital One consumer card agreement
  ("CapitalOne_Consumer_Card_Agreement.pdf",
   "https://ecm.capitalone.com/WCM/card/credit-card-agreement-for-consumer-cards-in-capital-one-bank-usa-na.pdf"),
]

for name, url in samples:
    out = os.path.join(DATA_DIR, name)
    if os.path.exists(out):
        print("Already exists:", name)
        continue

    print("Downloading:", name)
    # Download to a temporary name so an interrupted or failed transfer never
    # leaves a file that a later run would mistake for a finished PDF.
    partial = out + ".part"
    try:
        # --fail makes curl exit non-zero on HTTP errors instead of saving the
        # server's error page under the .pdf name.
        subprocess.run(["curl", "--fail", "-L", "-o", partial, url], check=True)
        os.replace(partial, out)
    except Exception as e:
        # Best effort: report and move on to the next sample.
        print("Failed to download", name, "->", e)
        if os.path.exists(partial):
            os.remove(partial)

print("Done. You can also upload PDFs manually into", DATA_DIR)

Downloading: OCC_Credit_Card_Lending_Handbook.pdf
Downloading: Chase_Cardmember_Rates_Fees.pdf
Downloading: CapitalOne_Consumer_Card_Agreement.pdf
Done. You can also upload PDFs manually into data


In [None]:
# Ingest: read PDFs, split, embed, index
import os, glob
from pypdf import PdfReader
from langchain_text_splitters import RecursiveCharacterTextSplitter
import chromadb
from chromadb.utils import embedding_functions

# Folder scanned for *.pdf files during ingestion (the same folder the samples are saved into).
DOC_DIR = DATA_DIR
# Name of the Chroma collection that ingestion writes to and retrieval queries.
COLL_NAME = "finance_policies"

def extract_pages(pdf_path):
    """Yield ``(page_index, text)`` for each page of ``pdf_path`` that contains text.

    ``page_index`` is the zero-based position of the page within the PDF. The
    yielded text has surrounding whitespace removed; pages whose extracted text
    is missing or blank are skipped.
    """
    reader = PdfReader(pdf_path)
    for index, page in enumerate(reader.pages):
        content = page.extract_text()
        if not content:
            # extract_text() returned None or "" for this page
            continue
        content = content.strip()
        if content:
            yield index, content

def ingest_pdfs(batch_size: int = 1000):
    """Read every PDF in ``DOC_DIR``, split it into chunks, and upsert them into Chroma.

    Each page is split independently, so every chunk keeps an exact
    ``{"source": <file name>, "page": <zero-based page index>}`` metadata record
    and a deterministic id of the form ``<file>::p<page>::c<chunk>``. Because ids
    are deterministic and the write is an upsert, re-running overwrites existing
    chunks instead of duplicating them.

    Args:
        batch_size: Maximum number of chunks sent to Chroma per upsert call.
            Chroma rejects a single call above its maximum batch size; the exact
            limit depends on the installation, so the default is a conservative
            value. TODO confirm against your Chroma build if you raise it.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")

    client = chromadb.PersistentClient(path=DB_DIR)
    embed = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name="all-MiniLM-L6-v2"
    )
    coll = client.get_or_create_collection(
        name=COLL_NAME, embedding_function=embed, metadata={"hnsw:space": "cosine"}
    )

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=900, chunk_overlap=150, separators=["\n\n", "\n", " ", ""]
    )

    ids, docs, metas = [], [], []
    # Sorted so the processing order (and therefore the log output) is deterministic.
    for pdf in sorted(glob.glob(os.path.join(DOC_DIR, "*.pdf"))):
        base = os.path.basename(pdf)
        pdf_ids, pdf_docs, pdf_metas = [], [], []
        try:
            for page_no, text in extract_pages(pdf):
                for ci, chunk in enumerate(splitter.split_text(text)):
                    pdf_ids.append(f"{base}::p{page_no}::c{ci}")
                    pdf_docs.append(chunk)
                    pdf_metas.append({"source": base, "page": page_no})
        except Exception as e:
            # One unreadable file (e.g. a truncated download) must not abort the
            # whole ingest; skip it entirely rather than index it half-way.
            print(f"Skipping {base}: could not read PDF ({e})")
            continue
        ids.extend(pdf_ids)
        docs.extend(pdf_docs)
        metas.extend(pdf_metas)

    if not docs:
        print("No text found—ensure PDFs are text-based or OCR them first.")
        return

    # Write in bounded batches so large corpora stay under Chroma's per-call limit.
    for start in range(0, len(docs), batch_size):
        stop = start + batch_size
        coll.upsert(
            ids=ids[start:stop], documents=docs[start:stop], metadatas=metas[start:stop]
        )
    print(f"Indexed {len(docs)} chunks from {len(set(m['source'] for m in metas))} PDFs.")

# Build (or refresh) the index now. Chunks are written with upsert under deterministic ids,
# so re-running this cell overwrites existing entries rather than adding duplicates.
ingest_pdfs()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Indexed 843 chunks from 3 PDFs.


In [None]:

from typing import List, Dict, Any

import chromadb
from chromadb.utils import embedding_functions

def rag_search(query: str, k: int = 5) -> List[Dict[str, Any]]:
    """Return up to ``k`` chunks from the local PDF collection nearest to ``query``.

    Args:
        query: Natural-language search text.
        k: Maximum number of hits wanted. Values <= 0 yield an empty list.

    Returns:
        A list of dicts with keys ``text``, ``source``, ``page``, ``distance``
        and ``similarity`` (``1 - distance``), in the order Chroma returns them.
        The list is empty when the collection has no documents yet.
    """
    if k <= 0:
        return []

    client = chromadb.PersistentClient(path=DB_DIR)
    embed = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name="all-MiniLM-L6-v2"
    )
    coll = client.get_or_create_collection(
        name=COLL_NAME, embedding_function=embed, metadata={"hnsw:space":"cosine"}
    )

    # Never ask for more neighbours than exist: an empty or tiny index would
    # otherwise make the query fail (or warn) depending on the Chroma version.
    available = coll.count()
    if available == 0:
        return []

    res = coll.query(
        query_texts=[query],
        n_results=min(k, available),
        include=["documents","metadatas","distances"],
    )
    # Results are nested per query text; we sent exactly one query.
    docs = (res.get("documents") or [[]])[0]
    metas = (res.get("metadatas") or [[]])[0]
    dists = (res.get("distances") or [[]])[0]

    out = []
    for doc, meta, dist in zip(docs, metas, dists):
        meta = meta or {}  # chunks stored without metadata come back as None
        distance = float(dist)
        out.append({
            "text": doc,
            "source": meta.get("source","unknown.pdf"),
            "page": meta.get("page", None),
            "distance": distance,
            "similarity": 1.0 - distance,  # cosine → similarity intuition
        })
    return out

def web_search(query: str, max_results: int = 5) -> Dict[str, Any]:
    """Run a Tavily web search and return ``{"summary": str, "urls": [str, ...]}``.

    The Tavily client is imported lazily and given the ``TAVILY_API_KEY``
    environment variable. Any failure (missing package, bad key, network error)
    is reported inside ``summary`` with an empty ``urls`` list instead of raising.
    """
    # Requires TAVILY_API_KEY
    try:
        from tavily import TavilyClient

        tavily = TavilyClient(api_key=os.environ.get("TAVILY_API_KEY"))
        response = tavily.search(
            query, search_depth="basic", include_answer=True, max_results=max_results
        )

        # Keep only results that actually carry a URL.
        links = []
        for item in response.get("results") or []:
            link = item.get("url")
            if link:
                links.append(link)

        summary = response.get("answer") or ""
        return {"summary": summary, "urls": links[:max_results]}
    except Exception as err:
        return {"summary": f"(Web search unavailable: {err})", "urls": []}

# Quick smoke test: run one retrieval against the persisted collection and report how many hits came back.
print("RAG test hits (count may be 0 if PDFs not yet downloaded):", len(rag_search("APR fee", k=3)))

RAG test hits (count may be 0 if PDFs not yet downloaded): 3


In [None]:
import json, os
from typing import List, Dict, Any
from openai import OpenAI

# Chat model used by the router; falls back to gpt-4o-mini when OPENAI_MODEL is unset.
OPENAI_MODEL = os.environ.get("OPENAI_MODEL", "gpt-4o-mini")

# System prompt for the routing LLM call. The 0.70-similarity / 2-hit rule stated here is
# also applied directly in code by router_decide before the LLM is consulted.
ROUTER_SYSTEM = (
    "You are a routing agent for a finance QA system. Decide whether to use LOCAL_PDF or WEB.\n"
    "Consider the question and the top local PDF retrieval hits (snippets + similarity).\n"
    "Rules:\n"
    "- Prefer LOCAL_PDF when there are strong and directly relevant matches (similarity >= 0.70) and at least 2 hits.\n"
    "- Use WEB when local matches are weak, off-topic, or the question is general/industry-wide without product/policy specifics.\n"
    "- Output strict JSON with keys: route ('pdf'|'web'), reason (short), confidence (0..1)."
)

def _parse_router_reply(content):
    """Best-effort parse of the router LLM reply into a dict; ``None`` if unusable.

    Models sometimes wrap JSON in Markdown code fences or add prose around it, so
    when the raw text is not valid JSON the outermost ``{...}`` span is tried too.
    """
    text = (content or "").strip()
    candidates = [text]
    start, end = text.find("{"), text.rfind("}")
    if 0 <= start < end:
        candidates.append(text[start:end + 1])
    for candidate in candidates:
        try:
            data = json.loads(candidate)
        except ValueError:  # json.JSONDecodeError is a ValueError
            continue
        if isinstance(data, dict):
            return data
    return None


def router_decide(question: str, pdf_hits: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Decide whether a question should be answered from local PDFs or from the web.

    Rule-based shortcuts are applied first and avoid an API call:
    no hits at all → ``web``; at least two hits with similarity >= 0.70 → ``pdf``.
    Otherwise the question and a compact preview of the top hits are sent to the
    chat model, whose JSON reply is parsed defensively.

    Args:
        question: The user's question.
        pdf_hits: Retrieval hits as produced by ``rag_search``.

    Returns:
        ``{"route": "pdf" | "web", "reason": str, "confidence": float in [0, 1]}``.
        Unparseable model output falls back to the ``web`` route.
    """
    if not pdf_hits:
        # Nothing local to answer from, so the PDF route cannot succeed.
        return {"route":"web","reason":"No local PDF hits were retrieved.","confidence":0.85}

    strong_hits = [h for h in pdf_hits if (h.get("similarity") or 0.0) >= 0.70]
    if len(strong_hits) >= 2:
        return {"route":"pdf","reason":"Strong local matches (>=0.70) and enough hits.","confidence":0.85}

    # Build a compact preview for the LLM
    preview = []
    for h in pdf_hits[:5]:
        text = h.get("text","")
        preview.append({
            "source": f"{h.get('source','')} p{h.get('page')}",
            "similarity": round(float(h.get("similarity") or 0.0),3),
            "text": (text[:400] + "…") if len(text)>420 else text
        })
    user_msg = (
        f"Question:\n{question}\n\n"
        f"Top PDF hits (preview):\n{json.dumps(preview, ensure_ascii=False, indent=2)}\n\n"
        "Respond ONLY with JSON: {\"route\":\"pdf|web\",\"reason\":\"...\",\"confidence\":0.0}"
    )

    client = OpenAI()
    resp = client.chat.completions.create(
        model=OPENAI_MODEL,
        messages=[{"role":"system","content":ROUTER_SYSTEM},
                  {"role":"user","content":user_msg}],
        temperature=0.0
    )

    # message.content may be None, and the reply may not be bare JSON.
    data = _parse_router_reply(resp.choices[0].message.content)
    if data is None:
        return {"route":"web","reason":"Failed to parse router output; defaulting to web.","confidence":0.5}

    route = data.get("route","web")
    if route not in ("pdf","web"):
        route = "web"
    reason = str(data.get("reason") or "")
    # A missing or malformed confidence should not throw away a valid route.
    try:
        conf = float(data.get("confidence",0.6))
    except (TypeError, ValueError):
        conf = 0.6
    conf = min(1.0, max(0.0, conf))
    return {"route": route, "reason": reason, "confidence": conf}

# Quick router smoke test. router_decide only calls the OpenAI API when fewer than two hits
# reach similarity 0.70, so this cell may or may not make a network request.
test_hits = rag_search("late payment fee", k=5)
print("Router sample decision (may say web if no strong hits yet):", router_decide("What is the late payment fee?", test_hits))

Router sample decision (may say web if no strong hits yet): {'route': 'web', 'reason': 'Local matches are weak and do not provide a clear definition of late payment fees.', 'confidence': 0.6}


In [None]:
from typing import Tuple, List, Dict, Any
from openai import OpenAI

USE_WEB = True  # set False for PDF-only

def _build_pdf_context(snips: List[Dict[str,Any]]) -> str:
    blocks = []
    for s in snips:
        tag = f"{s['source']} p{s['page']}"
        text = s['text']
        blocks.append(f"[{tag}]\n{text}")
    return "\n\n---\n\n".join(blocks)

def _pdf_sources(snips: List[Dict[str,Any]]) -> str:
    seen, out = set(), []
    for s in snips:
        tag = f"{s['source']} p{s['page']}"
        if tag not in seen:
            out.append(f"- {tag}")
            seen.add(tag)
    return "\n".join(out) if out else "- (none)"

def _web_sources(urls: List[str]) -> str:
    return "\n".join(f"- {u}" for u in urls) if urls else "- (none)"

def _llm_pdf(question: str, ctx: str) -> str:
    client = OpenAI()
    sys = ("You are a finance policy assistant. Answer ONLY from the PDF context. "
           "If insufficient, say you don't know. Be concise.")
    user = f"Question: {question}\n\nPDF Context:\n{ctx}\n\nAnswer:"
    r = client.chat.completions.create(
        model=os.environ.get("OPENAI_MODEL","gpt-4o-mini"),
        messages=[{"role":"system","content":sys},{"role":"user","content":user}],
        temperature=0.2
    )
    return r.choices[0].message.content.strip()

def _llm_web(question: str, web_summary: str) -> str:
    client = OpenAI()
    sys = ("You are a finance assistant. Use only the provided web summary. "
           "If uncertain, say you don't know. Be concise.")
    user = f"Question: {question}\n\nWeb Summary:\n{web_summary}\n\nAnswer:"
    r = client.chat.completions.create(
        model=os.environ.get("OPENAI_MODEL","gpt-4o-mini"),
        messages=[{"role":"system","content":sys},{"role":"user","content":user}],
        temperature=0.2
    )
    return r.choices[0].message.content.strip()

def ask(question: str, top_k: int = 5) -> str:
    """Answer ``question`` from local PDFs when possible, otherwise from the web.

    Flow: retrieve PDF chunks, let the router pick a route, then answer with the
    matching LLM prompt. If the router picks PDF but retrieval returned nothing,
    the web route is used instead (when ``USE_WEB`` is enabled).

    Args:
        question: The user's question; blank input is rejected without any API call.
        top_k: Number of PDF chunks to retrieve and pass as context.

    Returns:
        A Markdown string: the answer, the chosen route with confidence and
        reason, and the list of sources.
    """
    if not question or not question.strip():
        return "Please enter a question."

    # Retrieve from PDFs
    pdf_hits = rag_search(question, k=top_k)
    # Route
    decision = router_decide(question, pdf_hits)
    route, reason, conf = decision["route"], decision["reason"], decision["confidence"]

    if route == "pdf" and pdf_hits:
        selected = sorted(pdf_hits, key=lambda x: x["similarity"], reverse=True)[:top_k]
        ctx = _build_pdf_context(selected)
        text = _llm_pdf(question, ctx)
        return (
            f"{text}\n\n**Route:** PDF (confidence {conf:.2f}) — {reason}\n"
            f"**Sources (PDF):**\n{_pdf_sources(selected)}"
        )

    if route == "pdf" and USE_WEB:
        # Reaching here with route "pdf" means there were no hits to answer from.
        # Use the web rather than wrongly reporting that web fallback is disabled.
        route = "web"
        reason = f"{reason} (No PDF hits were retrieved; fell back to web.)"

    if route == "web" and USE_WEB:
        wr = web_search(question)
        text = _llm_web(question, wr.get("summary",""))
        return (
            f"{text}\n\n**Route:** Web (confidence {conf:.2f}) — {reason}\n"
            f"**Sources (Web):**\n{_web_sources(wr.get('urls', []))}"
        )

    return (
        "I couldn't find this in the PDFs and web fallback is disabled.\n\n"
        f"**Route:** {route} (confidence {conf:.2f}) — {reason}"
    )

In [None]:

# Example queries — edit these
questions = [
    "What is the late payment fee for the card?",
    "How is APR calculated for balance transfers?",
    "What is AML and why is it important?",  # likely to go to web if not in your PDFs
    "what are Risks Associated With Credit Card Lending"
]

# Each ask() call runs PDF retrieval and routing, then (depending on the route) an
# OpenAI request and possibly a Tavily web search, so this loop makes live, billable calls.
for q in questions:
    print("Q:", q)
    print(ask(q))
    print("\n" + "="*80 + "\n")

Q: What is the late payment fee for the card?
The late payment fee for the card is capped at $8.

**Route:** Web (confidence 0.50) — Local matches are weak and not directly relevant to the specific question about late payment fees.
**Sources (Web):**
- https://www.chase.com/personal/credit-cards/education/basics/credit-card-late-fees-explained
- https://files.consumerfinance.gov/f/documents/cfpb_credit-card-late-fees_report_2022-03.pdf
- https://www.cnbc.com/select/credit-card-late-fees-new-cap/
- https://www.consumerfinance.gov/about-us/newsroom/cfpb-bans-excessive-credit-card-late-fees-lowers-typical-fee-from-32-to-8/
- https://www.reddit.com/r/CreditCards/comments/1fw3inj/credit_card_late_fees_what_is_going_on/


Q: How is APR calculated for balance transfers?
APR for balance transfers is calculated using the Average Daily Balance method, which includes new transactions. The daily balance is determined by taking the beginning balance, adding any new transactions and interest charges

In [None]:
import gradio as gr

def gradio_qa(q):
    """Gradio callback: answer ``q`` via ``ask`` and show any failure as text.

    Exceptions are turned into an ``"Error: ..."`` string so the UI displays the
    problem instead of the request failing.
    """
    try:
        answer = ask(q)
    except Exception as err:
        answer = f"Error: {err}"
    return answer

# Single-textbox UI: the question is passed to gradio_qa and its Markdown answer is rendered.
demo = gr.Interface(
    fn=gradio_qa,
    inputs=gr.Textbox(label="Ask a finance question"),
    outputs=gr.Markdown(label="Answer"),
    title="PDF-first RAG with Router + Web Fallback",
    description="Search PDFs first; route to web only if local context is weak or missing."
)

# Launch the app (Colab will display it inline). Comment this line out to only build `demo`.
demo.launch(share=False)
# The message used to tell readers to uncomment a call that had already run.
print("Gradio app launched.")

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Note: opening Chrome Inspector may crash demo inside Colab notebooks.
* To create a public link, set `share=True` in `launch()`.


<IPython.core.display.Javascript object>

Gradio app is ready. Uncomment demo.launch(...) to run it.
