# Financial Research Agent - Router -> [web search/RAG/LLM] -> Summarizing
## Please note that this notebook uses the latest way to create and use ReAct agents and OpenAI models using Langchain

### Please visit https://docs.langchain.com/oss/python/langchain/agents and https://docs.langchain.com/oss/python/integrations/chat/openai for more details

In [None]:
#!pip install -qU langchain langchain_openai langgraph openbb openbb-yfinance

In [309]:
import os
import json
import re
import os
import numpy as np
import json
import requests
from typing import Dict, Any, List, Optional
from typing import TypedDict, Literal
from bs4 import BeautifulSoup

from langchain.tools import tool
from langchain.agents import create_agent
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS

from langchain_community.tools.tavily_search import TavilySearchResults

from langgraph.graph import StateGraph, END
from langchain_core.messages import SystemMessage, HumanMessage
from langchain_openai import ChatOpenAI
from langchain_core.messages import AIMessage
from langgraph.graph import StateGraph, START, END
from typing import TypedDict, Annotated, List, Union, Literal
from langchain_core.messages import BaseMessage
from operator import itemgetter
from langchain_core.messages import HumanMessage, AIMessage

import pandas as pd
import glob
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters  import RecursiveCharacterTextSplitter

from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import FAISS

from sentence_transformers import CrossEncoder

In [55]:
# --- API credentials and HTTP headers ---------------------------------------
# Do not commit real keys. Prefer exporting OPENAI_API_KEY / TAVILY_API_KEY in
# the environment before starting the kernel. The two variables below are
# optional in-notebook overrides: leave them as the "<API KEY>" placeholder to
# keep whatever the environment already provides.
_API_KEY_PLACEHOLDER = "<API KEY>"

tavily_api_key = "<API KEY>"
openai_api_key = "<API KEY>"

# Export a key only when it was actually filled in. Writing the placeholder into
# os.environ would overwrite a valid key from the environment and would make the
# "key is not set" checks in get_chat_llm / finance_web_search pass on junk.
if openai_api_key and openai_api_key != _API_KEY_PLACEHOLDER:
    os.environ["OPENAI_API_KEY"] = openai_api_key
if tavily_api_key and tavily_api_key != _API_KEY_PLACEHOLDER:
    os.environ["TAVILY_API_KEY"] = tavily_api_key

# Resolved keys; None when neither the override nor the environment supplies one.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
TAVILY_API_KEY = os.getenv("TAVILY_API_KEY")

# Headers sent with every page fetched through `requests`.
HEADERS = {"User-Agent": "FinancialAssistant/1.0"}

## RAG Implementation

### The RAG route is selected only if the query is related to Netflix and/or Oracle, because the two PDFs processed from the RAG folder cover those companies.

In [57]:
# Directory that is scanned for `*.pdf` files to load into the RAG index.
DATA_DIR = "Financial_Docs"

def load_all_pdfs(path):
    """Load every ``*.pdf`` directly inside ``path`` into a flat list of page documents.

    Args:
        path: Directory to scan (not recursive).

    Returns:
        A list with one document per PDF page, in sorted-filename order. Each
        page's ``metadata["source"]`` is set to the PDF's base filename. An
        empty list is returned when no PDF matches.
    """
    docs = []
    # glob returns paths in an OS-dependent order; sorting keeps the page order
    # (and any index derived from it later) identical across machines and runs.
    for pdf_file in sorted(glob.glob(os.path.join(path, "*.pdf"))):
        pages = PyPDFLoader(pdf_file).load()
        source_name = os.path.basename(pdf_file)
        for page in pages:
            # Replace the loader's source with the bare filename for citations.
            page.metadata["source"] = source_name
        docs.extend(pages)
    return docs

# Load every PDF page found under DATA_DIR.
raw_docs = load_all_pdfs(DATA_DIR)

# Split pages into overlapping chunks; the overlap keeps text that straddles a
# chunk boundary retrievable from either side.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=200
)

chunked_docs = splitter.split_documents(raw_docs)

# Tag each chunk with its position in `chunked_docs`; format_docs() reports this
# value as the chunk number in citations.
for idx, doc in enumerate(chunked_docs):
    doc.metadata["chunk_index"] = idx

In [58]:
# Sanity check: display the first two chunks together with their metadata.
chunked_docs[:2]

[Document(metadata={'producer': 'EDGRpdf Service w/ EO.Pdf 22.0.40.0', 'creator': 'EDGAR Filing HTML Converter', 'creationdate': '2025-10-22T16:03:23-04:00', 'title': '0001065280-25-000406', 'author': 'EDGAR® Online LLC, a subsidiary of OTC Markets Group', 'subject': 'Form 10-Q filed on 2025-10-22 for the period ending 2025-09-30', 'keywords': '0001065280-25-000406; ; 10-Q', 'moddate': '2025-10-22T16:03:31-04:00', 'source': 'Netflix.pdf', 'total_pages': 42, 'page': 0, 'page_label': '1', 'chunk_index': 0}, page_content='UNITED STATES\nSECURITIES AND EXCHANGE COMMISSION\nWashington, D.C. 20549\nFORM 10-Q\n(Mark One)\n☒  QUARTERLY REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934\nFor the quarterly period ended September 30, 2025\nOR\n☐  TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934\nFor the transition period from                  to                 \nCommission File Number: 001-35727\nNetflix, Inc.\n(Exact name of Reg

In [59]:
# Embedding model used to index the chunks.
emb = OpenAIEmbeddings(model="text-embedding-3-small")

# Build the FAISS index from the chunks. The retriever requests k=12 results per
# query; answer_query() then cuts them down to 5 with the cross-encoder reranker.
vectorstore = FAISS.from_documents(chunked_docs, emb)
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 12}  
)

In [60]:
# Module-level cross-encoder used by rerank_with_crossencoder() to score (query, passage) pairs.
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2") # "BAAI/bge-reranker-v2-m3" is a bit slow while testing

In [61]:
def rerank_with_crossencoder(query, docs, top_k=5, batch_size=16):
    """Re-order retrieved documents by cross-encoder score and keep the best ones.

    Args:
        query: The search string each document is scored against.
        docs: Candidate documents; each must expose ``page_content``.
        top_k: Maximum number of documents to return.
        batch_size: Forwarded to ``reranker.predict``.

    Returns:
        Up to ``top_k`` items of ``docs``, highest score first, or ``[]`` when
        ``docs`` is empty.
    """
    if not docs:
        return []

    # Score every (query, passage) pair with the module-level cross-encoder.
    query_passage_pairs = [[query, candidate.page_content] for candidate in docs]
    relevance = reranker.predict(query_passage_pairs, batch_size=batch_size)

    # argsort is ascending, so reverse it for best-first order, then truncate.
    best_first = np.argsort(relevance)[::-1]
    selected = []
    for position in best_first[:top_k]:
        selected.append(docs[position])
    return selected

In [62]:
def format_docs(docs, max_chars=900):
    """Render documents as numbered context blocks plus a citation lookup table.

    Args:
        docs: Documents exposing ``page_content`` and a ``metadata`` mapping
            with ``"source"`` and ``"chunk_index"`` keys.
        max_chars: Each document's text is truncated to this many characters.

    Returns:
        A ``(context, citation_map)`` tuple. ``context`` is the numbered blocks
        joined by a ``---`` separator; ``citation_map`` maps each 1-based block
        number to its source, chunk index and truncated excerpt.
    """
    separator = "\n\n---\n\n"
    rendered = []
    citation_map = {}
    number = 0
    for document in docs:
        number += 1
        excerpt = document.page_content[:max_chars]
        meta = document.metadata
        source_name = meta["source"]
        chunk_no = meta["chunk_index"]
        rendered.append(f"[{number}] Source: {source_name} | chunk={chunk_no} \n{excerpt}")
        citation_map[number] = dict(
            source=source_name,
            chunk_index=chunk_no,
            excerpt=excerpt,
        )
    return separator.join(rendered), citation_map


def generate_answer(question, docs):
    """Ask the chat model to answer ``question`` using only the supplied documents.

    Args:
        question: The user question inserted into the prompt.
        docs: Documents passed to ``format_docs`` to build the context blocks.

    Returns:
        A ``(answer_text, citation_map)`` tuple: the model's reply and the
        citation table produced by ``format_docs``.
    """
    context, citation_map = format_docs(docs)

    # The prompt is spelled out line by line so the trailing spaces that are
    # part of the text stay visible.
    prompt = (
        "\n"
        "You are a factual research assistant. \n"
        "Use ONLY the context blocks below to answer. \n"
        "Use inline citations like [1][2]. \n"
        'If unknown, say "Not answerable from documents."\n'
        "\n"
        "Context:\n"
        f"{context}\n"
        "\n"
        f"Question: {question}\n"
        "\n"
        'Answer with citations, then add a "SOURCES" section.\n'
    )

    chat_model = ChatOpenAI(model="gpt-4o-mini", temperature=0)
    reply = chat_model.invoke(prompt)
    return reply.content, citation_map

In [89]:
def answer_query(question: str) -> str:
    """Run the full RAG pipeline for one question.

    Retrieves candidates with ``retriever``, keeps the five best according to
    ``rerank_with_crossencoder``, and generates a cited answer.

    Args:
        question: The question to answer from the indexed PDFs.

    Returns:
        The generated answer immediately followed by the JSON-serialized
        citation map (no separator in between).
    """
    candidates = retriever.invoke(question)
    best_docs = rerank_with_crossencoder(question, candidates, top_k=5)

    answer_text, citation_map = generate_answer(question, best_docs)
    serialized_citations = json.dumps(citation_map)
    return answer_text + serialized_citations

In [91]:
# Smoke test of the RAG pipeline on its own. The printed text is the answer
# immediately followed by the JSON citation map returned by answer_query().
question = "How is Oracle Cloud Applications (OCA)  performing?"
answer = answer_query(question)
print(answer)

Oracle Cloud Applications (OCA) are positioned as an industry-leading business innovation platform that leverages Oracle Cloud Infrastructure (OCI). The OCA offerings provide a broad suite of modular, next-generation cloud software applications that span all core business functions. This includes solutions like Oracle Fusion Cloud ERP, which aims to improve decision-making and workforce productivity by utilizing a single data and security model with a common user interface [3][4]. 

The comprehensive and flexible deployment models of OCA are seen as a key factor in Oracle's growth strategy, allowing customers to choose options that best meet their specific business needs. This flexibility is a significant differentiator compared to competitors who may offer fewer options and more restrictive deployment models [2]. 

Furthermore, Oracle anticipates continued growth in cloud services and license support expenses, driven by customer demand for enhanced data center capacity and the establi

## Web search - Tavily implementation

In [177]:
# Word-boundary regex patterns grouped by route name.
# NOTE(review): neither list is referenced by any code visible in this notebook
# (router_node classifies with an LLM) - presumably left over from a
# keyword-based router; confirm before relying on or removing them.
WEB_KEYWORDS = [r"\blatest\b", r"\btoday\b", r"\bcurrent\b", r"\brecent\b", r"\bbreaking\b"]
RAG_KEYWORDS = [r"\bpolicy\b", r"\binternal\b", r"\bproduct\b", r"\bmanual\b", r"\bknowledge base\b"]

def get_chat_llm(model: str = "gpt-4o-mini", temperature: float = 0.0) -> ChatOpenAI:
    """Create a ``ChatOpenAI`` client authenticated with ``OPENAI_API_KEY``.

    Args:
        model: OpenAI chat model name.
        temperature: Sampling temperature passed to the client.

    Returns:
        A configured ``ChatOpenAI`` instance.

    Raises:
        RuntimeError: If ``OPENAI_API_KEY`` is empty or unset.
    """
    if OPENAI_API_KEY:
        return ChatOpenAI(model=model, temperature=temperature, api_key=OPENAI_API_KEY)
    raise RuntimeError("OPENAI_API_KEY must be set in environment")

def extract_text(html: str) -> str:
    """Reduce an HTML document to its text, one non-empty line per row.

    ``<script>``, ``<style>`` and ``<noscript>`` elements are removed before the
    text is extracted; every remaining line is stripped and blank lines are
    dropped.

    Args:
        html: Raw HTML markup.

    Returns:
        The cleaned text with lines joined by ``"\\n"``.
    """
    soup = BeautifulSoup(html, "html.parser")
    # Remove elements whose contents should not count as page text.
    for hidden_tag in soup(["script", "style", "noscript"]):
        hidden_tag.decompose()

    stripped_lines = (raw_line.strip() for raw_line in soup.get_text(separator="\n").splitlines())
    return "\n".join(kept for kept in stripped_lines if kept)

In [179]:
@tool
def finance_web_search(topic: str) -> str:
    """
    Use TavilySearchResults to find articles and investment blogs for a given
    `topic` (e.g., 'NVIDIA Q3 earnings analysis', 'impact of Fed rate hike on tech stocks').
    It fetches the top pages and summarizes them with a Financial Analyst LLM.
    Returns a plain text aggregation of financial summaries.
    """
    # The docstring above is kept verbatim: per the LangChain docs the `@tool`
    # decorator uses it as the tool description.

    # Treat both known placeholder strings as "no key configured".
    if not TAVILY_API_KEY or TAVILY_API_KEY in ("YOUR_TAVILY_API_KEY", "<API KEY>"):
        return "TAVILY_API_KEY not set; cannot run Tavily search. Please set the environment variable."

    query = f"financial analysis and investment insights on {topic}"

    # `max_results` is the documented result-count parameter of TavilySearchResults.
    # NOTE(review): confirm `api_key` is a keyword this version honours; the
    # wrapper may instead read TAVILY_API_KEY from the environment.
    search = TavilySearchResults(max_results=5, api_key=TAVILY_API_KEY)
    results = search.run(query)

    # Anything other than a list of hits (e.g. an error string) must not be
    # iterated as if it were results; that used to yield an empty answer.
    if not isinstance(results, list):
        return f"Tavily search failed for '{topic}': {results}"

    urls = [hit["url"] for hit in results if isinstance(hit, dict) and hit.get("url")]
    if not urls:
        return f"No web results found for '{topic}'."

    llm = get_chat_llm(model="gpt-4o-mini", temperature=0)

    summaries: List[str] = []
    for url in urls:
        try:
            resp = requests.get(url, headers=HEADERS, timeout=10)
            resp.raise_for_status()
            # Cap the page text so one long article cannot blow up the prompt.
            text = extract_text(resp.text)[:12000]

            prompt = [
                SystemMessage(content="You are a sophisticated Financial Analyst and content summarizer. Produce a crisp summary focused on investment takeaways, risks, and financial implications."),
                HumanMessage(content=f"Summarize the following webpage content for financial and investment insights related to '{topic}':\n\n{text}")
            ]

            response = llm.invoke(prompt)
            summary = getattr(response, "content", str(response))
            summaries.append(f"Source URL: {url}\nSummary: {summary}\n")

        except Exception as e:
            # Best effort by design: one unreachable or unparsable page is
            # reported inline and the remaining URLs are still processed.
            summaries.append(f"Source URL: {url}\nError summarizing page: {e}\n")

    return "\n".join(summaries)

# Implementation of LangGraph Nodes

In [315]:
def add_messages(left: list, right: list):
    """Reducer for the ``messages`` state field.

    Args:
        left: Messages already stored in the state.
        right: Messages supplied by the latest update.

    Returns:
        A new list holding ``left`` followed by ``right``; neither input is
        modified.
    """
    combined = left + right
    return combined

class FinancialAgentState(TypedDict, total=False):
    """
    Represents the state of our financial assistant conversation.
    It combines the conversational history with explicit data fields
    for multi-step financial analysis.

    ``total=False`` makes every key optional, so a node may return only the
    keys it updates.
    """
    # Conversation history; updates are merged by the `add_messages` reducer.
    messages: Annotated[List[Union[HumanMessage, AIMessage]], add_messages]
    # Refined query text produced by refine_query_node.
    query: str
    # Short topic phrase produced by refine_query_node.
    topic: str
    # Gathered material that the summarizer reads first.
    raw_data: str
    # The web_search / rag / llm_reason nodes return their result under this key
    # and the summarizer falls back to it, so it is declared here.
    # NOTE(review): LangGraph is understood to keep only keys declared on the
    # state schema - confirm against the StateGraph documentation.
    evidence: str
    # Report text written by the summarizer node.
    final_report: str
    # Branch chosen by router_node.
    route: Literal["web_search", "rag", "llm"]

### Refine Query Node - Entry node

In [345]:
def refine_query_node(state: FinancialAgentState) -> FinancialAgentState:
    """Entry node: turn the latest human message into a refined query and a topic.

    Only the most recent human message in ``state["messages"]`` is used; any
    ``query`` / ``topic`` already present in the state is replaced by the
    returned values.

    Args:
        state: Current graph state.

    Returns:
        ``{"query": ..., "topic": ...}``; both are empty strings when no human
        message with content is found.
    """
    node_name = "refine_query"

    # Walk the history backwards so the most recent human turn wins.
    user_query = ""
    for msg in reversed(state.get("messages", [])):
        if msg.type == "human":
            user_query = msg.content
            break

    if not user_query:
        output = {"query": "", "topic": ""}
        log_node(node_name, state, output)
        return output

    # Use the notebook's own LLM factory. The previous `get_llm` name is not
    # defined anywhere in the notebook and raised NameError on a fresh kernel.
    llm = get_chat_llm(model="gpt-4o-mini", temperature=0)

    refine_system = (
        "You are a financial reasoning assistant.\n"
        "Refine the user's query so downstream components can understand it.\n"
        "Return ONLY the improved query.\n"
        "Do NOT answer the question.\n"
    )

    resp = llm.invoke([
        {"role": "system", "content": refine_system},
        {"role": "user", "content": f"Refine this financial query: {user_query}"}
    ])

    # Fall back to the original wording if the model returns an empty reply, so
    # downstream nodes never receive a blank query for a non-blank question.
    refined = resp.content.strip() or user_query

    topic_system = (
        "Extract the main financial topic from the query in 3–6 words.\n"
        "Return ONLY the topic phrase."
    )

    topic_resp = llm.invoke([
        {"role": "system", "content": topic_system},
        {"role": "user", "content": refined}
    ])

    topic = topic_resp.content.strip()

    output = {
        "query": refined,
        "topic": topic
    }

    log_node(node_name, state, output)
    return output


### Router Node

In [319]:
def build_classifier_llm():
    """
    Returns an LLM used only for routing/classification.
    No tools. Pure prompt → output.

    Built with ``get_chat_llm`` (model ``gpt-4o-mini``, temperature 0).
    """
    # `get_chat_llm` is the factory this notebook defines; the previously called
    # `get_llm` does not exist and raised NameError on a fresh kernel.
    return get_chat_llm(
        model="gpt-4o-mini",
        temperature=0
    )

In [None]:
def _normalize_route(raw_reply) -> str:
    """Map a classifier reply onto "web_search", "rag" or "llm" (default "llm")."""
    text = str(raw_reply).lower().replace("-", "_")
    # Drop quotes, backticks, punctuation and newlines the model may add.
    text = re.sub(r"[^a-z_ ]", "", text).strip()
    text = re.sub(r"\s+", "_", text)
    if text == "websearch":
        text = "web_search"
    return text if text in {"web_search", "rag", "llm"} else "llm"


def router_node(state: FinancialAgentState) -> FinancialAgentState:
    """Choose which branch should gather evidence for the refined query.

    Args:
        state: Current graph state; only ``state["query"]`` is read.

    Returns:
        ``{"route": r}`` where ``r`` is ``"web_search"``, ``"rag"`` or
        ``"llm"``. An empty query, or a reply that cannot be mapped to one of
        those names, yields ``"llm"``.
    """
    node_name = "router"

    # `or ""` also covers an explicit None, which would otherwise crash on .strip().
    query = (state.get("query") or "").strip()

    if not query:
        output = {"route": "llm"}
        log_node(node_name, state, output)
        return output

    llm = build_classifier_llm()

    # Each instruction ends with a newline. The fragments used to be
    # concatenated with no separator ("...search the webIf its a question...")
    # and named the category "websearch" instead of the expected "web_search".
    system_prompt = (
        "Classify the user's financial query into EXACTLY one category:\n"
        "rag has details about the companies Netflix and Oracle; for other companies search the web.\n"
        "If it is a question about other companies, use web_search.\n"
        "If it is just about non-financial matters, use the regular llm.\n"
        "- web_search\n- rag\n- llm\nReturn ONLY one token."
    )

    resp = llm.invoke([
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": query}
    ])

    # Accept harmless decoration such as "`rag`", "RAG." or "web search".
    route = _normalize_route(resp.content)

    output = {"route": route}
    log_node(node_name, state, output)
    return output


### Web search, RAG and general LLM nodes

In [None]:
def web_search_node(state: FinancialAgentState) -> FinancialAgentState:
    """Gather evidence for the refined query through the Tavily-backed web tool.

    Args:
        state: Current graph state; only ``state["query"]`` is read.

    Returns:
        A dict carrying the tool's text under both ``"raw_data"`` and
        ``"evidence"``.
    """
    node_name = "web_search"

    q = state.get("query") or ""
    evidence = finance_web_search.invoke({"topic": q})
    # `raw_data` is the field the summarizer reads first, so the result is
    # written there; `evidence` is kept for anything that already reads it.
    # NOTE(review): LangGraph is understood to drop keys that are not declared
    # on the state schema, which is why `evidence` alone never reached the
    # summarizer - confirm against the StateGraph documentation.
    output = {"raw_data": evidence, "evidence": evidence}

    log_node(node_name, state, output)
    return output

def rag_node(state: FinancialAgentState) -> FinancialAgentState:
    """Gather evidence for the refined query from the indexed PDFs.

    Args:
        state: Current graph state; only ``state["query"]`` is read.

    Returns:
        A dict carrying the ``answer_query`` result under both ``"raw_data"``
        and ``"evidence"``.
    """
    node_name = "rag"

    # `or ""` also covers an explicit None query.
    q = state.get("query") or ""
    evidence = answer_query(q)
    # `raw_data` is the field the summarizer reads first, so the result is
    # written there; `evidence` is kept for anything that already reads it.
    # NOTE(review): LangGraph is understood to drop keys that are not declared
    # on the state schema, which is why `evidence` alone never reached the
    # summarizer - confirm against the StateGraph documentation.
    output = {"raw_data": evidence, "evidence": evidence}

    log_node(node_name, state, output)
    return output

def llm_node(state: FinancialAgentState) -> FinancialAgentState:
    """Answer the refined query directly with the chat model (no retrieval).

    Args:
        state: Current graph state; only ``state["query"]`` is read.

    Returns:
        A dict carrying the model's reply under both ``"raw_data"`` and
        ``"evidence"``.
    """
    node_name = "llm_reason"

    # `or ""` also covers an explicit None query.
    q = state.get("query") or ""
    # `get_chat_llm` is the factory this notebook defines; the previously called
    # `get_llm` does not exist and raised NameError on a fresh kernel.
    llm = get_chat_llm()

    resp = llm.invoke([
        {"role": "system", "content": "Answer concisely."},
        {"role": "user", "content": q}
    ])

    # `raw_data` is the field the summarizer reads first, so the reply is
    # written there; `evidence` is kept for anything that already reads it.
    output = {"raw_data": resp.content, "evidence": resp.content}
    log_node(node_name, state, output)
    return output

### Final summarizer node

In [367]:
def financial_summarizer_node(state: Dict[str, Any]) -> Dict[str, Any]:
    """Final node: write the financial report from the gathered evidence.

    Args:
        state: Graph state. ``topic`` (falling back to ``query``) names the
            report; ``raw_data`` (falling back to ``evidence``) supplies the
            material.

    Returns:
        ``{"final_report": <model reply text>}``.
    """
    node_name = "summarizer"

    # `get_chat_llm` is the factory this notebook defines; the previously called
    # `get_llm` does not exist and raised NameError on a fresh kernel.
    llm = get_chat_llm(model="gpt-4o")

    # Chain with `or` rather than dict.get defaults: callers seed the state with
    # empty strings (e.g. "raw_data": ""), and a present-but-empty value used to
    # win over the fallback, leaving the report with no evidence at all.
    topic = state.get("topic") or state.get("query") or "financial topic"
    raw_data = state.get("raw_data") or state.get("evidence") or ""

    system_instruction = (
        "You are a Chief Investment Strategist..."
    )

    user_instruction = f"""
Write a detailed Financial Analysis Report for: {topic}

--- EVIDENCE ---
{raw_data}
"""

    response = llm.invoke([
        {"role": "system", "content": system_instruction},
        {"role": "user", "content": user_instruction}
    ])

    output = {"final_report": response.content}
    log_node(node_name, state, output)
    return output

### Logging and Creating the flow

In [323]:
def log_node(node_name: str, input_state: Any, output_state: Any):
    """Print a banner showing a node's input and output state.

    Args:
        node_name: Label printed in the banner header.
        input_state: State the node received; the graph nodes pass their state
            dict, so this is annotated ``Any`` rather than ``str``.
        output_state: Update the node is about to return (also a dict in the
            graph nodes).

    Returns:
        None. The banner is written to stdout.
    """
    print(f"\n\n===== NODE TRIGGERED: {node_name} =====")
    print(f"--- INPUT STATE ---\n{input_state}")
    print(f"--- OUTPUT STATE ---\n{output_state}")
    print("=====================================\n")

In [347]:
def create_flow():
    """Build and compile the LangGraph pipeline for the financial assistant.

    Topology: ``refine_query`` -> ``router`` -> one of ``web_search`` / ``rag`` /
    ``llm_reason`` (chosen by ``state["route"]``) -> ``summarizer`` -> END.

    Returns:
        The compiled graph.
    """
    builder = StateGraph(FinancialAgentState)

    # Register every node first (name -> callable), in pipeline order.
    node_table = {
        "refine_query": refine_query_node,
        "router": router_node,
        "web_search": web_search_node,
        "rag": rag_node,
        "llm_reason": llm_node,
        "summarizer": financial_summarizer_node,
    }
    for name, handler in node_table.items():
        builder.add_node(name, handler)

    builder.set_entry_point("refine_query")
    builder.add_edge("refine_query", "router")

    # The router's "llm" label maps to the node registered as "llm_reason".
    route_targets = {
        "web_search": "web_search",
        "rag": "rag",
        "llm": "llm_reason",
    }
    builder.add_conditional_edges("router", lambda s: s["route"], route_targets)

    # All three gathering branches converge on the summarizer.
    for gatherer in ("web_search", "rag", "llm_reason"):
        builder.add_edge(gatherer, "summarizer")
    builder.add_edge("summarizer", END)

    return builder.compile()



In [349]:
# Compile the graph once; the test cases that follow all invoke this `flow`.
flow = create_flow()

## Test Case 1 - Demonstrates RAG Route

As you can see in the output section logs
===== NODE TRIGGERED: router =====
--- INPUT STATE ---
{'messages': [HumanMessage(content='Compare the valuation and recent news for Netflix and Oracle.', additional_kwargs={}, response_metadata={})], 'query': 'Compare the current valuation metrics and recent news developments for Netflix and Oracle.', 'topic': 'Valuation metrics for Netflix and Oracle', 'raw_data': ''}
--- OUTPUT STATE ---
{'route': 'rag'}

In [339]:
# Seed state for the run. refine_query_node reads only the latest human message
# and returns new `query` / `topic` values, so the ones given here are starting
# placeholders.
input_state = {
    "messages": [
        HumanMessage(content="Compare the valuation and recent news for Netflix and Oracle.")
    ],
    "query": "Compare the valuation and recent news for Netflix and Oracle.",
    "topic": "Netflix vs Oracle Stocks",
    "raw_data": "",
}

result = flow.invoke(input_state)

# Print the report; if no report was produced, print the whole final state instead.
print("\n===== FINANCIAL REPORT =====\n")
print(result.get("final_report") or result)



===== NODE TRIGGERED: refine_query =====
--- INPUT STATE ---
{'messages': [HumanMessage(content='Compare the valuation and recent news for Netflix and Oracle.', additional_kwargs={}, response_metadata={})], 'query': 'Compare the valuation and recent news for Netflix and Oracle.', 'topic': 'Netflix vs Oracle Stocks', 'raw_data': ''}
--- OUTPUT STATE ---
{'query': 'Compare the current valuation metrics and recent news developments for Netflix and Oracle.', 'topic': 'Valuation metrics for Netflix and Oracle'}



===== NODE TRIGGERED: router =====
--- INPUT STATE ---
{'messages': [HumanMessage(content='Compare the valuation and recent news for Netflix and Oracle.', additional_kwargs={}, response_metadata={})], 'query': 'Compare the current valuation metrics and recent news developments for Netflix and Oracle.', 'topic': 'Valuation metrics for Netflix and Oracle', 'raw_data': ''}
--- OUTPUT STATE ---
{'route': 'rag'}



===== NODE TRIGGERED: rag =====
--- INPUT STATE ---
{'messages': [Hum

## Test Case 2 - Demonstrates Websearch Route

As you can see in the output section logs  ===== NODE TRIGGERED: router =====
--- INPUT STATE ---
{'messages': [HumanMessage(content='Compare the valuation and most recent news for Google and Tesla.', additional_kwargs={}, response_metadata={})], 'query': 'Compare the current valuation metrics and latest news updates for Google and Tesla.', 'topic': 'Valuation metrics for Google and Tesla', 'raw_data': ''}
--- OUTPUT STATE ---
{'route': 'web_search'}

In [351]:
# Seed state for the run. refine_query_node reads only the latest human message
# and returns new `query` / `topic` values, so the ones given here are starting
# placeholders.
input_state = {
    "messages": [
        HumanMessage(content="Compare the valuation and most recent news for Google and Tesla.")
    ],
    "query": "Compare the valuation and recent news for Google and Tesla.",
    "topic": "Google vs Tesla Stocks",
    "raw_data": "",
}

result = flow.invoke(input_state)

# Print the report; if no report was produced, print the whole final state instead.
print("\n===== FINANCIAL REPORT =====\n")
print(result.get("final_report") or result)



===== NODE TRIGGERED: refine_query =====
--- INPUT STATE ---
{'messages': [HumanMessage(content='Compare the valuation and most recent news for Google and Tesla.', additional_kwargs={}, response_metadata={})], 'query': 'Compare the valuation and recent news for Google and Tesla.', 'topic': 'Google vs Tesla Stocks', 'raw_data': ''}
--- OUTPUT STATE ---
{'query': 'Compare the current valuation metrics and latest news updates for Google and Tesla.', 'topic': 'Valuation metrics for Google and Tesla'}



===== NODE TRIGGERED: router =====
--- INPUT STATE ---
{'messages': [HumanMessage(content='Compare the valuation and most recent news for Google and Tesla.', additional_kwargs={}, response_metadata={})], 'query': 'Compare the current valuation metrics and latest news updates for Google and Tesla.', 'topic': 'Valuation metrics for Google and Tesla', 'raw_data': ''}
--- OUTPUT STATE ---
{'route': 'web_search'}



===== NODE TRIGGERED: web_search =====
--- INPUT STATE ---
{'messages': [Human

## Test Case 3 - Demonstrates General LLM Route

As you can see in the output section logs
===== NODE TRIGGERED: router =====
--- INPUT STATE ---
{'messages': [HumanMessage(content='How to decide which stocks to invest in?', additional_kwargs={}, response_metadata={})], 'query': 'What criteria should I use to choose stocks for investment?', 'topic': 'Stock selection criteria for investment', 'raw_data': ''}
--- OUTPUT STATE ---
{'route': 'llm'}

In [353]:
# Seed state for the run. refine_query_node reads only the latest human message
# and returns new `query` / `topic` values, so the ones given here are starting
# placeholders.
input_state = {
    "messages": [
        HumanMessage(content="How to decide which stocks to invest in?")
    ],
    "query": "How to decide which stocks to invest in?",
    "topic": "General investment strategies",
    "raw_data": "",
}

result = flow.invoke(input_state)

# Print the report; if no report was produced, print the whole final state instead.
print("\n===== FINANCIAL REPORT =====\n")
print(result.get("final_report") or result)



===== NODE TRIGGERED: refine_query =====
--- INPUT STATE ---
{'messages': [HumanMessage(content='How to decide which stocks to invest in?', additional_kwargs={}, response_metadata={})], 'query': 'How to decide which stocks to invest in?', 'topic': 'General investment strategies', 'raw_data': ''}
--- OUTPUT STATE ---
{'query': 'What criteria should I use to choose stocks for investment?', 'topic': 'Stock selection criteria for investment'}



===== NODE TRIGGERED: router =====
--- INPUT STATE ---
{'messages': [HumanMessage(content='How to decide which stocks to invest in?', additional_kwargs={}, response_metadata={})], 'query': 'What criteria should I use to choose stocks for investment?', 'topic': 'Stock selection criteria for investment', 'raw_data': ''}
--- OUTPUT STATE ---
{'route': 'llm'}



===== NODE TRIGGERED: llm_reason =====
--- INPUT STATE ---
{'messages': [HumanMessage(content='How to decide which stocks to invest in?', additional_kwargs={}, response_metadata={})], 'query'