In [None]:
#!pip install -qU langchain langchain_openai langgraph openbb openbb-yfinance

In [53]:
import os
import json
import re
import os
import numpy as np
import json
import requests
from typing import Dict, Any, List, Optional
from typing import TypedDict, Literal
from bs4 import BeautifulSoup

# LangChain imports
from langchain.tools import tool
from langchain.agents import create_agent  # agent builder used in ref
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS

# Tavily search (community tool)
from langchain_community.tools.tavily_search import TavilySearchResults

# LangGraph imports (StateGraph-based flow)
from langgraph.graph import StateGraph, END

# Chat interface used in your reference
from langchain_openai import ChatOpenAI
from langchain_core.messages import AIMessage
from langgraph.graph import StateGraph, START, END

import pandas as pd
import glob
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters  import RecursiveCharacterTextSplitter

from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import FAISS

from sentence_transformers import CrossEncoder

In [55]:

# API keys are taken from the process environment. For backward compatibility,
# a notebook-level `openai_api_key` / `tavily_api_key` variable is exported to
# the environment first when an earlier cell defined one; a fresh kernel
# without those variables no longer fails with NameError.
for _env_name, _var_name in (
    ("OPENAI_API_KEY", "openai_api_key"),
    ("TAVILY_API_KEY", "tavily_api_key"),
):
    _value = globals().get(_var_name)
    if _value:
        os.environ[_env_name] = _value
    elif not os.getenv(_env_name):
        # Do not raise here: downstream helpers report a missing key themselves.
        print(f"Warning: {_env_name} is not set; calls that need it will fail.")

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
TAVILY_API_KEY = os.getenv("TAVILY_API_KEY")

# Sent with every outbound page fetch made through `requests`.
HEADERS = {"User-Agent": "FinancialAssistant/1.0"}

In [57]:
DATA_DIR = "Financial_Docs"

def load_all_pdfs(path):
    """Load every ``*.pdf`` file directly inside ``path`` as page-level documents.

    Args:
        path: Directory to scan (not recursive).

    Returns:
        A flat list of the pages returned by ``PyPDFLoader(...).load()`` for each
        file, with ``metadata["source"]`` overwritten by the PDF's base file name.
        Returns an empty list when no PDF matches.
    """
    docs = []
    # glob returns files in filesystem order; sorting keeps the page order (and
    # therefore the chunk indices assigned afterwards) stable between runs.
    for pdf_file in sorted(glob.glob(f"{path}/*.pdf")):
        file_name = os.path.basename(pdf_file)
        pages = PyPDFLoader(pdf_file).load()
        for page in pages:
            page.metadata["source"] = file_name
        docs.extend(pages)
    return docs

# Read every PDF page from DATA_DIR, then cut the pages into overlapping chunks.
raw_docs = load_all_pdfs(DATA_DIR)

splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=200)
chunked_docs = splitter.split_documents(raw_docs)

# Record each chunk's position in the global chunk list; citations use it later.
for idx, doc in enumerate(chunked_docs):
    doc.metadata.update({"chunk_index": idx})

In [58]:
# Spot-check the first two chunks (metadata and text) produced by the splitter.
chunked_docs[:2]

[Document(metadata={'producer': 'EDGRpdf Service w/ EO.Pdf 22.0.40.0', 'creator': 'EDGAR Filing HTML Converter', 'creationdate': '2025-10-22T16:03:23-04:00', 'title': '0001065280-25-000406', 'author': 'EDGAR® Online LLC, a subsidiary of OTC Markets Group', 'subject': 'Form 10-Q filed on 2025-10-22 for the period ending 2025-09-30', 'keywords': '0001065280-25-000406; ; 10-Q', 'moddate': '2025-10-22T16:03:31-04:00', 'source': 'Netflix.pdf', 'total_pages': 42, 'page': 0, 'page_label': '1', 'chunk_index': 0}, page_content='UNITED STATES\nSECURITIES AND EXCHANGE COMMISSION\nWashington, D.C. 20549\nFORM 10-Q\n(Mark One)\n☒  QUARTERLY REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934\nFor the quarterly period ended September 30, 2025\nOR\n☐  TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934\nFor the transition period from                  to                 \nCommission File Number: 001-35727\nNetflix, Inc.\n(Exact name of Reg

In [59]:
# Embed every chunk and index the vectors in an in-memory FAISS store.
emb = OpenAIEmbeddings(model="text-embedding-3-small")
vectorstore = FAISS.from_documents(chunked_docs, emb)

# First-stage retrieval: plain similarity search returning 12 candidates per query.
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 12})

In [60]:
# Cross-encoder model used by rerank_with_crossencoder to re-score retrieved chunks.
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2") # "BAAI/bge-reranker-v2-m3" is a bit slow while testing

In [61]:
def rerank_with_crossencoder(query, docs, top_k=5, batch_size=16):
    """Re-order ``docs`` by the cross-encoder score of each (query, text) pair.

    Args:
        query: Query string paired with every document's ``page_content``.
        docs: Candidate documents; a falsy value yields an empty list.
        top_k: Maximum number of documents to return.
        batch_size: Forwarded to ``reranker.predict``.

    Returns:
        Up to ``top_k`` documents, highest score first.
    """
    if not docs:
        return []
    query_doc_pairs = [[query, candidate.page_content] for candidate in docs]
    relevance = reranker.predict(query_doc_pairs, batch_size=batch_size)
    # argsort is ascending, so reverse it before keeping the leading top_k positions.
    best_first = np.argsort(relevance)[::-1][:top_k]
    return [docs[position] for position in best_first]

In [62]:
def format_docs(docs, max_chars=900):
    """Render documents as numbered context blocks plus a citation lookup.

    Args:
        docs: Documents exposing ``page_content`` and a ``metadata`` mapping with
            ``source`` and ``chunk_index`` keys.
        max_chars: Each document's text is cut to this many characters.

    Returns:
        ``(context, citation_map)``: the blocks joined by a ``---`` separator, and
        a dict mapping each 1-based block number to its source, chunk index and
        excerpt.
    """
    rendered_blocks = []
    citation_map = {}
    number = 0
    for doc in docs:
        number += 1
        excerpt = doc.page_content[:max_chars]
        meta = doc.metadata
        rendered_blocks.append(
            f"[{number}] Source: {meta['source']} | chunk={meta['chunk_index']} \n{excerpt}"
        )
        citation_map[number] = dict(
            source=meta["source"],
            chunk_index=meta["chunk_index"],
            excerpt=excerpt,
        )
    separator = "\n\n---\n\n"
    return separator.join(rendered_blocks), citation_map


def generate_answer(question, docs):
    """Ask gpt-4o-mini to answer ``question`` using only the formatted ``docs``.

    Args:
        question: The user's question, inserted verbatim into the prompt.
        docs: Documents passed to ``format_docs`` to build the context blocks.

    Returns:
        ``(answer_text, citation_map)``: the model reply's ``content`` and the
        citation map produced by ``format_docs``.
    """
    context, citation_map = format_docs(docs)
    chat_model = ChatOpenAI(model="gpt-4o-mini", temperature=0)

    grounded_prompt = f"""
You are a factual research assistant. 
Use ONLY the context blocks below to answer. 
Use inline citations like [1][2]. 
If unknown, say "Not answerable from documents."

Context:
{context}

Question: {question}

Answer with citations, then add a "SOURCES" section.
"""
    reply = chat_model.invoke(grounded_prompt)
    return reply.content, citation_map

In [89]:
def answer_query(question: str) -> str:
    """Run retrieve, rerank and generate for ``question``.

    Returns:
        The generated answer text immediately followed by the JSON-encoded
        citation map (no separator is inserted between the two).
    """
    candidates = retriever.invoke(question)
    best_docs = rerank_with_crossencoder(question, candidates, top_k=5)

    answer_text, citation_map = generate_answer(question, best_docs)
    citations_json = json.dumps(citation_map)
    return answer_text + citations_json

In [91]:
# Smoke-test the retrieve -> rerank -> generate pipeline with a single question.
question = "How is Oracle Cloud Applications (OCA)  performing?"
answer = answer_query(question)
print(answer)

Oracle Cloud Applications (OCA) are positioned as an industry-leading business innovation platform that leverages Oracle Cloud Infrastructure (OCI). The OCA offerings provide a broad suite of modular, next-generation cloud software applications that span all core business functions. This includes solutions like Oracle Fusion Cloud ERP, which aims to improve decision-making and workforce productivity by utilizing a single data and security model with a common user interface [3][4]. 

The comprehensive and flexible deployment models of OCA are seen as a key factor in Oracle's growth strategy, allowing customers to choose options that best meet their specific business needs. This flexibility is a significant differentiator compared to competitors who may offer fewer options and more restrictive deployment models [2]. 

Furthermore, Oracle anticipates continued growth in cloud services and license support expenses, driven by customer demand for enhanced data center capacity and the establi

In [175]:
@tool
def rag_search(query: str) -> str:
    """
    Has details of latest quarterly and annual performance of Netflix and Oracle.
    RAG retrieval tool backed by the in-memory FAISS index built from the local PDF filings.
    Returns a cited answer followed by a JSON map of the cited sources.
    """
    try:
        # The result must be returned; without it the tool always yields None.
        return answer_query(query)
    except Exception as e:
        # Tools report failures as text so the calling agent can react to them.
        return f"Error in rag_search: {e}"

In [177]:
# Router keywords: regex patterns, each wrapped in word boundaries.
# NOTE(review): no code visible in this notebook references these two lists;
# both router_node definitions classify with the LLM only. Confirm whether a
# keyword pre-check is still planned or these can be dropped.
WEB_KEYWORDS = [r"\blatest\b", r"\btoday\b", r"\bcurrent\b", r"\brecent\b", r"\bbreaking\b"]
RAG_KEYWORDS = [r"\bpolicy\b", r"\binternal\b", r"\bproduct\b", r"\bmanual\b", r"\bknowledge base\b"]

# LLM factory (ChatOpenAI wrapper used in your snippet)
def get_chat_llm(model: str = "gpt-4o-mini", temperature: float = 0.0) -> ChatOpenAI:
    """Build a ``ChatOpenAI`` client that uses the module-level ``OPENAI_API_KEY``.

    Args:
        model: OpenAI chat model name.
        temperature: Sampling temperature passed to the client.

    Raises:
        RuntimeError: If ``OPENAI_API_KEY`` is empty or unset.
    """
    if OPENAI_API_KEY:
        return ChatOpenAI(model=model, temperature=temperature, api_key=OPENAI_API_KEY)
    raise RuntimeError("OPENAI_API_KEY must be set in environment")

# Lightweight HTML text extractor
def extract_text(html: str) -> str:
    """Return the visible text of an HTML document, one non-empty line per row.

    ``<script>``, ``<style>`` and ``<noscript>`` elements are removed before the
    text is extracted; every remaining line is stripped and blank lines dropped.
    """
    soup = BeautifulSoup(html, "html.parser")
    for hidden_tag in soup(["script", "style", "noscript"]):
        hidden_tag.decompose()
    raw_text = soup.get_text(separator="\n")
    stripped_lines = (raw_line.strip() for raw_line in raw_text.splitlines())
    return "\n".join(kept for kept in stripped_lines if kept)

In [179]:
@tool
def finance_web_search(topic: str) -> str:
    """
    Use TavilySearchResults to find articles and investment blogs for a given
    `topic` (e.g., 'NVIDIA Q3 earnings analysis', 'impact of Fed rate hike on tech stocks').
    It fetches the top pages and summarizes them with a Financial Analyst LLM.
    Returns a plain text aggregation of financial summaries.
    """
    if TAVILY_API_KEY == "YOUR_TAVILY_API_KEY" or not TAVILY_API_KEY:
        return "TAVILY_API_KEY not set; cannot run Tavily search. Please set the environment variable."

    # Build search query: Directly use the topic, adding context for better results
    query = f"financial analysis and investment insights on {topic}"

    # Use the Tavily tool wrapper
    # NOTE(review): confirm `k` / `api_key` are the keyword names this Tavily
    # wrapper version expects; they are left as in the original call.
    search = TavilySearchResults(k=5, api_key=TAVILY_API_KEY)
    results = search.run(query)

    # Keep only dict hits that carry a URL. A string result (e.g. an error
    # message returned instead of a hit list) contributes no URLs.
    hits = [] if isinstance(results, str) else (results or [])
    urls = [hit["url"] for hit in hits if isinstance(hit, dict) and hit.get("url")]
    if not urls:
        return f"No web results found for: {topic}"

    llm = get_chat_llm(model="gpt-4o-mini", temperature=0)

    summaries: List[str] = []
    for url in urls:
        try:
            resp = requests.get(url, headers=HEADERS, timeout=10)
            resp.raise_for_status()
            text = extract_text(resp.text)
            # trim to reasonable length for LLM
            text = text[:12000]

            # Role/content dicts (the message style used by the other nodes in
            # this notebook) avoid depending on SystemMessage/HumanMessage,
            # which are not imported before this cell runs.
            prompt = [
                {
                    "role": "system",
                    "content": "You are a sophisticated Financial Analyst and content summarizer. Produce a crisp summary focused on investment takeaways, risks, and financial implications.",
                },
                {
                    "role": "user",
                    "content": f"Summarize the following webpage content for financial and investment insights related to '{topic}':\n\n{text}",
                },
            ]

            response = llm.invoke(prompt)
            # The ChatOpenAI response object typically has a 'content' attribute
            summary = getattr(response, "content", str(response))
            summaries.append(f"Source URL: {url}\nSummary: {summary}\n")

        except Exception as e:
            # Best effort: one unreachable or unparsable page must not abort the rest.
            summaries.append(f"Source URL: {url}\nError summarizing page: {e}\n")

    return "\n".join(summaries)

In [180]:
def router_node(state: Dict[str, Any]) -> Dict[str, Any]:
    """
    Router node: asks the LLM to classify the query stored in ``state["query"]``.
    Output: {"route": "web_search" | "rag" | "llm"}

    Falls back to "llm" when the query is empty, the LLM call raises, or the
    reply contains none of the three labels.

    NOTE: a later cell defines another ``router_node``; when the cells run top
    to bottom, that later definition replaces this one.
    """
    # The query comes from the state; there is no free `query` variable here.
    query = str(state.get("query") or "").strip()
    if not query:
        return {"route": "llm"}

    llm = get_chat_llm(model="gpt-4o-mini", temperature=0)

    prompt = [
        {
            "role": "system",
            "content": (
                "Classify the user query into one routing category. "
                "Reply with exactly ONE of: web_search, rag, llm. "
                "No explanation."
            )
        },
        {"role": "user", "content": query},
    ]

    try:
        resp = llm.invoke(prompt)
        output = getattr(resp, "content", "").strip().lower()

        # sanitize LLM output: the first label found in this order wins
        for label in ("web_search", "rag", "llm"):
            if label in output:
                return {"route": label}

    except Exception as exc:
        # Keep the best-effort fallback, but make the failure visible.
        print(f"router_node: classification failed ({exc}); defaulting to 'llm'")

    return {"route": "llm"}

In [256]:
def financial_summarizer_node(state: Dict[str, Any]) -> Dict[str, Any]:
    """
    Turn the evidence held in ``state`` into a structured investment report.

    Keys read from ``state`` (each has a default):
      - topic: subject of the report
      - route: label naming where the evidence came from
      - evidence: a string, or a list of strings joined with blank lines

    Returns {"final_report": "<text>"}

    NOTE: a later cell defines another ``financial_summarizer_node`` that reads
    ``raw_data``; when the cells run top to bottom it replaces this one.
    """
    llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

    topic = state.get("topic", "the requested financial subject")
    route = state.get("route", "LLM reasoning")
    evidence = state.get("evidence", "")

    # Collapse the evidence into one text blob.
    ev_text = "\n\n".join(evidence) if isinstance(evidence, list) else str(evidence)

    persona = "You are a Chief Investment Strategist and expert financial report writer. Your analysis must be detailed, objective, and actionable. Produce executive-level investment summaries."
    report_request = f"""
Write a comprehensive **Financial Analysis and Investment Report** for the topic: **{topic}**. 
Base your analysis ONLY on the raw data and insights provided in the EVIDENCE section below.

If the topic involves a comparison (e.g., NVIDIA vs. INTEL), dedicate a section to the comparative analysis.
If the evidence includes a clear conclusion (like a "better choice"), explicitly state the rationale.

--- EVIDENCE (from {route}) ---
{ev_text}

Output format:
**Executive Summary and Investment Thesis**
- (Start with a 1-2 sentence recommendation or high-level conclusion.)

**Key Financial Metrics Analysis**
- (Analyze P/E ratio, market cap, growth rates, or other fundamental data from the evidence.)

**Comparative Analysis (If Applicable)**
- (If multiple stocks are involved, compare them directly based on the metrics you analyzed.)

**Recent Market Insights and News**
- (Summarize any external data/news collected.)

**Risk Factors and Opportunities**
- (Based on the evidence, list the main risks and potential growth opportunities.)

**Final Investment Outlook**
- (Provide a clear, detailed final recommendation on the action to take (e.g., Buy, Hold, Sell) or the final answer to the user's specific question.)
"""

    chat_messages = [
        {"role": "system", "content": persona},
        {"role": "user", "content": report_request},
    ]

    response = llm.invoke(chat_messages)
    return {"final_report": getattr(response, "content", str(response))}

In [258]:
from typing import TypedDict, Annotated, List, Union, Literal
from langchain_core.messages import BaseMessage
from operator import itemgetter # Required for LangGraph state management
from langchain_core.messages import HumanMessage, AIMessage

# Define the Reducer Function (Crucial for conversation memory)
def add_messages(left: list, right: list):
    """Reducer for the ``messages`` state field: existing messages, then new ones.

    Returns a new list; neither input list is modified.
    """
    merged = left + right
    return merged

# Define the State
class FinancialAgentState(TypedDict, total=False):
    """
    Shared state passed between the nodes of the financial assistant graph.

    Declared with ``total=False``, so every key is optional. It combines the
    conversation history with the explicit fields that the workflow nodes
    read and write.
    """
    # --- CONVERSATION HISTORY ---
    # Annotated with the add_messages reducer defined above.
    messages: Annotated[List[Union[HumanMessage, AIMessage]], add_messages]
    
    # --- FINANCIAL WORKFLOW FIELDS ---
    query: str          # User query; refine_query_node overwrites it with an LLM-refined version
    topic: str          # Short topic phrase (e.g., 'NVIDIA vs INTEL'); written by refine_query_node
    
    # Evidence text that financial_summarizer_node reads when writing the report.
    raw_data: str 
    
    # The synthesized report text returned by financial_summarizer_node.
    final_report: str
    
    # Routing decision written by router_node; create_flow branches on this value.
    route: Literal["web_search", "rag", "llm"]

In [260]:
def get_llm(model="gpt-4o-mini", temperature=0):
    """Return a ``ChatOpenAI`` client for ``model`` at the given ``temperature``."""
    llm_settings = {"model": model, "temperature": temperature}
    return ChatOpenAI(**llm_settings)

In [262]:
def build_classifier_llm():
    """
    Build the LLM used for routing/classification only.
    It is a plain ``get_llm`` client (gpt-4o-mini, temperature 0) with no tools attached.
    """
    classifier = get_llm(model="gpt-4o-mini", temperature=0)
    return classifier

In [263]:
def build_financial_agent():
    """Create a tool-less reasoning agent via ``create_agent``.

    The agent wraps a gpt-4o-mini model (temperature 0) with a system prompt
    that restricts it to reasoning only; its tool list is empty.
    """
    system_prompt = (
        "You are a helpful financial reasoning assistant. "
        "Analyze the user's query and provide structured reasoning. "
        "Only perform reasoning here — do not search the web or retrieve external data. "
        "If reasoning seems insufficient for the user's question, the router will decide another path."
    )
    reasoning_model = get_llm(model="gpt-4o-mini", temperature=0)

    # no tools
    agent = create_agent(model=reasoning_model, tools=[], system_prompt=system_prompt)
    return agent

In [265]:
def refine_query_node(state: FinancialAgentState) -> FinancialAgentState:
    """
    Graph entry node: rewrite the latest human message into a clearer query
    and derive a short topic phrase from it.

    Returns a partial state update ``{"query": ..., "topic": ...}``; both are
    empty strings when no non-empty human message is found.
    """
    # Most recent human message wins; fall back to "" when there is none.
    history = state.get("messages", [])
    user_query = next(
        (message.content for message in reversed(history) if message.type == "human"),
        "",
    )
    if not user_query:
        return {"query": "", "topic": ""}

    # IMPORTANT: Use a RAW LLM, not the financial agent
    llm = get_llm(model="gpt-4o-mini", temperature=0)

    # Step 1: ask for a cleaned-up version of the query.
    refine_system = (
        "You are a financial reasoning assistant.\n"
        "Refine the user's query so downstream components can understand it.\n"
        "Return ONLY the improved query.\n"
        "Do NOT answer the question.\n"
    )
    refine_messages = [
        {"role": "system", "content": refine_system},
        {"role": "user", "content": f"Refine this financial query: {user_query}"},
    ]
    refined = llm.invoke(refine_messages).content.strip()

    # Step 2: ask for a short topic phrase based on the refined query.
    topic_system = (
        "Extract the main financial topic from the query in 3–6 words.\n"
        "Return ONLY the topic phrase."
    )
    topic_messages = [
        {"role": "system", "content": topic_system},
        {"role": "user", "content": refined},
    ]
    topic = llm.invoke(topic_messages).content.strip()

    # Nodes must return a dict of state updates.
    return {"query": refined, "topic": topic}


In [267]:
# ------------------------------------------------------------
# 4. ROUTER NODE
# ------------------------------------------------------------
def router_node(state: FinancialAgentState) -> FinancialAgentState:
    """
    Classifies query into: web_search, rag, llm.

    Reads ``state["query"]`` and returns ``{"route": <label>}``. The route is
    "llm" when the query is empty or missing, or when the model reply does not
    name exactly one of the three labels.
    """

    # `or ""` guards against an explicit None stored under "query".
    query = (state.get("query") or "").strip()

    if not query:
        return {"route": "llm"}

    llm = build_classifier_llm()

    resp = llm.invoke([
        {
            "role": "system",
            "content": (
                "Classify the user's financial query into EXACTLY one category:\n"
                "- web_search : if the question needs very recent or external market/news data.\n"
                "- rag        : if the question should use internal knowledgebase or company reports.\n"
                "- llm        : if the question is reasoning-oriented and needs no external data.\n\n"
                "Return ONLY one token: web_search, rag, or llm."
            )
        },
        {"role": "user", "content": query}
    ])

    reply = str(resp.content).strip().lower()

    # Accept the label even when wrapped in quotes, backticks or punctuation
    # (e.g. "rag."), which an exact string comparison would reject. A reply
    # that mentions more than one label is ambiguous and falls back to "llm".
    labels = set(re.findall(r"\b(web_search|rag|llm)\b", reply))
    route = labels.pop() if len(labels) == 1 else "llm"

    return {"route": route}



# ------------------------------------------------------------
# 5. WEB SEARCH NODE (Tavily)
# ------------------------------------------------------------
def web_search_node(state: FinancialAgentState) -> FinancialAgentState:
    """Placeholder web-search node: returns mock evidence for the query.

    The result is written to ``raw_data``, the state field the summarizer
    reads; an ``evidence`` key is not part of ``FinancialAgentState``.
    """
    # Fall back to the topic when no query is present ("city" is not a state field).
    q = state.get("query") or state.get("topic") or ""
    # Replace with TavilySearchResults run
    evidence = f"(Mock Tavily search results for: {q})"
    return {"raw_data": evidence}



# ------------------------------------------------------------
# 6. RAG NODE (FAISS / docstore)
# ------------------------------------------------------------
def rag_node(state: FinancialAgentState) -> FinancialAgentState:
    """Placeholder RAG node: returns mock retrieval evidence for the query.

    The result is written to ``raw_data``, the state field the summarizer
    reads; an ``evidence`` key is not part of ``FinancialAgentState``.
    """
    q = state.get("query", "")
    # Replace with real RAG chain
    evidence = f"(Mock RAG retrieval for: {q})"
    return {"raw_data": evidence}



# ------------------------------------------------------------
# 7. LLM REASONING NODE
# ------------------------------------------------------------
def llm_node(state: FinancialAgentState) -> FinancialAgentState:
    """Answer the query directly with the default LLM, without external data.

    The reply text is written to ``raw_data``, the state field the summarizer
    reads; an ``evidence`` key is not part of ``FinancialAgentState``.
    """
    q = state.get("query", "")
    llm = get_llm()

    resp = llm.invoke([
        {"role": "system", "content": "Answer concisely."},
        {"role": "user", "content": q}
    ])

    return {"raw_data": resp.content}

# ------------------------------------------------------------
# 8. FINAL FINANCIAL SUMMARIZER NODE
# ------------------------------------------------------------
def financial_summarizer_node(state: Dict[str, Any]) -> Dict[str, Any]:
    """
    Synthesize the final financial analysis report from evidence present in the state.
    
    Reads from state:
      - topic (falls back to ``query``, then to a generic phrase, when empty or missing)
      - raw_data (string with the collected source data; ``evidence`` is accepted
        as a fallback key, and a list/tuple is joined with blank lines)
    
    When no evidence is present at all (missing or empty), the prompt states
    that no data was retrieved instead of passing an empty evidence section.
    
    Returns {"final_report": "<text>"}
    """
    
    # Use a strong model for final synthesis
    llm = get_llm(model="gpt-4o") 
    
    # `or` chains (rather than .get defaults) so empty strings also fall through.
    topic = state.get("topic") or state.get("query") or "the requested financial subject"
    raw_data = (
        state.get("raw_data")
        or state.get("evidence")
        or "No data was retrieved by the tools."
    )
    if isinstance(raw_data, (list, tuple)):
        raw_data = "\n\n".join(str(item) for item in raw_data)

    # Define the system prompt for the Financial Analyst persona
    system_instruction = (
        "You are a Chief Investment Strategist and expert financial report writer. "
        "Your analysis must be detailed, objective, and actionable. Produce executive-level investment summaries. "
        "DO NOT repeat the raw data, only synthesize the insights into a coherent report."
    )
    
    user_instruction = f"""
Write a comprehensive **Financial Analysis and Investment Report** for the topic: **{topic}**. 
Base your analysis ONLY on the raw data and insights provided in the EVIDENCE section below.

If the topic involves a comparison (e.g., NVIDIA vs. INTEL), dedicate a section to the comparative analysis.
If the evidence includes a clear conclusion, explicitly state the rationale.

--- EVIDENCE (Raw Tool Data) ---
{raw_data}

Output format:
**Executive Summary and Investment Thesis**
- (Start with a 1-2 sentence recommendation or high-level conclusion.)

**Key Financial Metrics Analysis**
- (Analyze P/E ratio, market cap, growth rates, or other fundamental data from the evidence.)

**Comparative Analysis (If Applicable)**
- (If multiple stocks are involved, compare them directly based on the metrics you analyzed.)

**Recent Market Insights and News**
- (Summarize any external data/news collected.)

**Final Investment Outlook**
- (Provide a clear, detailed final recommendation on the action to take (e.g., Buy, Hold, Sell) or the final answer to the user's specific question.)
"""

    prompt = [
        {"role": "system", "content": system_instruction},
        {"role": "user", "content": user_instruction}
    ]

    response = llm.invoke(prompt)
    return {"final_report": response.content}

In [268]:
def create_flow():
    """Build and compile the LangGraph flow for the financial assistant.

    Topology: refine_query -> router -> one of (web_search | rag | llm_reason)
    -> summarizer -> END. The branch taken after the router is selected by the
    ``route`` value in the state.
    """
    graph = StateGraph(FinancialAgentState)

    # Register the nodes in pipeline order.
    node_table = (
        ("refine_query", refine_query_node),
        ("router", router_node),
        ("web_search", web_search_node),
        ("rag", rag_node),
        ("llm_reason", llm_node),
        ("summarizer", financial_summarizer_node),
    )
    for node_name, node_fn in node_table:
        graph.add_node(node_name, node_fn)
    graph.set_entry_point("refine_query")

    graph.add_edge("refine_query", "router")

    # The router's "llm" label maps to the node registered as "llm_reason".
    route_targets = {"web_search": "web_search", "rag": "rag", "llm": "llm_reason"}
    graph.add_conditional_edges("router", lambda s: s["route"], route_targets)

    # Every branch feeds the summarizer, which ends the run.
    for branch_name in ("web_search", "rag", "llm_reason"):
        graph.add_edge(branch_name, "summarizer")
    graph.add_edge("summarizer", END)

    return graph.compile()



In [270]:

if __name__ == "__main__":
    # Demo run: build the graph and push one comparison question through it.
    flow = create_flow()

    input_state = dict(
        messages=[
            HumanMessage(content="Compare the valuation and recent news for NVIDIA and Oracle.")
        ],
        query="Compare the valuation and recent news for NVIDIA and Oracle.",
        topic="NVIDIA vs Oracle Stocks",
        raw_data="",
    )

    result = flow.invoke(input_state)

    print("\n===== FINANCIAL REPORT =====\n")
    # Show the whole result state when no report text was produced.
    print(result.get("final_report") or result)


===== FINANCIAL REPORT =====

**Executive Summary and Investment Thesis**
- Based on the valuation metrics and recent market performance, NVIDIA presents a compelling growth opportunity, albeit with a higher risk profile due to its premium valuation. Oracle, on the other hand, offers a more stable investment with moderate growth prospects and a more attractive valuation. Investors should consider NVIDIA for aggressive growth exposure and Oracle for stability and income.

**Key Financial Metrics Analysis**
- **NVIDIA**: The company exhibits a high Price-to-Earnings (P/E) ratio, reflecting strong investor confidence in its future growth potential, driven by its leadership in AI and graphics processing units (GPUs). NVIDIA's market capitalization has surged, indicating robust market sentiment and expectations of continued revenue growth. However, the elevated P/E ratio suggests that the stock is priced for perfection, and any deviation from growth expectations could lead to significant v