In [4]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from urllib.parse import urljoin
import time

def find_presentation_pdf(base_url, keywords=("presentation",)):
    # Setup Chrome options
    options = Options()
    options.add_argument("--headless")  # Run in headless mode if desired
    driver = webdriver.Chrome(options=options)

    try:
        print(f"Opening: {base_url}")
        driver.get(base_url)
        driver.implicitly_wait(10)

        # Find all links to PDFs
        pdf_links = driver.find_elements(By.XPATH, "//a[contains(@href, '.pdf')]")

        for link in pdf_links:
            text = link.text.lower()
            href = link.get_attribute("href")
            # Check if the link text or nearby context matches keywords
            if any(keyword in text for keyword in keywords):
                full_link = urljoin(base_url, href)
                print("✅ Found matching presentation PDF:", full_link)
                return full_link

        print("❌ No matching presentation PDF found.")
        return None

    except Exception as e:
        print("Error:", e)
    finally:
        driver.quit()

# Example usage
if __name__ == "__main__":
    # Change this to any bank's investor page
    investor_url = "https://investor.bankofamerica.com/"
    keywords = ("presentation", "earnings", "q1", "results")  # Add more if needed

    pdf_link = find_presentation_pdf(investor_url, keywords)
    if pdf_link:
        # Optional: open in browser or download
        print("📄 PDF Link:", pdf_link)


Opening: https://investor.bankofamerica.com/
✅ Found matching presentation PDF: https://d1io3yog0oux5.cloudfront.net/_a7dcff183b29f29c742f216ce5e1c038/bankofamerica/db/780/10173/pdf/The+Presentation+Materials_1Q25_ADA.pdf
📄 PDF Link: https://d1io3yog0oux5.cloudfront.net/_a7dcff183b29f29c742f216ce5e1c038/bankofamerica/db/780/10173/pdf/The+Presentation+Materials_1Q25_ADA.pdf


In [7]:
import requests
import os
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Download PDF
def download_pdf(pdf_url, save_path="presentation.pdf"):
    response = requests.get(pdf_url)
    if response.status_code == 200:
        with open(save_path, "wb") as f:
            f.write(response.content)
        print(f"✅ PDF downloaded to {save_path}")
        return save_path
    else:
        raise Exception(f"Failed to download PDF. Status code: {response.status_code}")

# Load and prepare context from PDF
def load_and_prepare_pdf(local_pdf_path):
    loader = PyPDFLoader(local_pdf_path)
    pages = loader.load_and_split()
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    docs = splitter.split_documents(pages)
    return "\n".join([doc.page_content for doc in docs[:5]])

# Delete the file
def delete_file(file_path):
    if os.path.exists(file_path):
        os.remove(file_path)
        print(f"🗑️ Deleted file: {file_path}")
    else:
        print(f"⚠️ File not found: {file_path}")

# DeepSeek API call function
def query_deepseek(prompt, context):
    api_url = "https://openrouter.ai/api/v1/chat/completions"
    headers = {
        "Authorization": "Bearer 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",  # replace with actual key
        "Content-Type": "application/json"
    }
    payload = {
        "model": "deepseek/deepseek-r1-0528:free",
        "messages": [
            {"role": "system", "content": "You are a financial analyst extracting precise numerical data."},
            {"role": "user", "content": f"Context: {context}\n\nQuestion: {prompt}"}
        ],
        "temperature": 0
    }

    response = requests.post(api_url, headers=headers, json=payload)
    return response.json()

# Main logic
if __name__ == "__main__":
    pdf_url = pdf_link  # Replace with actual URL
    local_path = "presentation.pdf"

    try:
        downloaded_path = download_pdf(pdf_url, local_path)
        context = load_and_prepare_pdf(downloaded_path)

        # Run financial queries
        queries = [
            "What is the total revenue reported? Provide exact value only",
            "What is the net income reported? Provide exact value only",
        ]

        for query in queries:
            response = query_deepseek(query, context)
            print(response)
            answer = response['choices'][0]['message']['content']
            print(f"Q: {query}\nA: {answer}\n")

    finally:
        delete_file(local_path)  # Always clean up the file


✅ PDF downloaded to presentation.pdf
{'id': 'gen-1748635261-CuaXoPF5FiXYjOVe7GTT', 'provider': 'Chutes', 'model': 'deepseek/deepseek-r1-0528:free', 'object': 'chat.completion', 'created': 1748635262, 'choices': [{'logprobs': None, 'finish_reason': 'stop', 'native_finish_reason': 'stop', 'index': 0, 'message': {'role': 'assistant', 'content': '$27.4B', 'refusal': None, 'reasoning': 'Alright, let me tackle this user query. The user is asking for the total revenue reported by Bank of America in their 1Q25 financial results. They specifically want the exact value only. \n\nFirst, I recall that in financial contexts like this, "revenue" typically refers to the top-line income before expenses. Scanning the provided data, I spot the "Revenue $27.4B1" under the 1Q25 Highlights section. The superscript 1 indicates it\'s net of interest expense, which aligns with footnote 1\'s definition. \n\nHmm, I should verify if there\'s any ambiguity. The document also mentions net income ($7.4B) and EPS fi