In [18]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from urllib.parse import urljoin
import time

def find_presentation_pdf(base_url, keywords=("presentation",), timeout=15):
    options = Options()
    options.add_argument("--headless")
    driver = webdriver.Chrome(options=options)

    try:
        print(f"🌐 Opening: {base_url}")
        driver.get(base_url)

        # Optional: Wait for the page to load (customize if you know dynamic loading is needed)
        time.sleep(5)  # You can adjust this delay based on page complexity

        # Find all <a> tags with href ending with .pdf
        pdf_links = driver.find_elements(By.XPATH, "//a[contains(translate(@href, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), '.pdf')]")
        print(f"🔍 Found {len(pdf_links)} PDF links on the page.")

        for link in pdf_links:
            href = link.get_attribute("href")
            link_text = link.text.strip().lower()
            if href:
                # Match keywords in href or link text
                if any(kw.lower() in link_text or kw.lower() in href.lower() for kw in keywords):
                    full_link = urljoin(base_url, href)
                    print("✅ Found matching PDF:", full_link)
                    # return full_link

        print("❌ No matching PDF found.")
        return None

    except Exception as e:
        print("❌ Error:", e)
        return None
    finally:
        driver.quit()

# Example usage
test_url ="https://www.jpmorganchase.com/ir/quarterly-earnings"

keywords = ("presentation", "q1 2025", "earnings")

pdf_link = find_presentation_pdf(test_url, keywords)
if pdf_link:
    print(f"📄 PDF Link: {pdf_link}")
else:
    print(f"❌ No PDF found")

🌐 Opening: https://www.jpmorganchase.com/ir/quarterly-earnings
🔍 Found 304 PDF links on the page.
✅ Found matching PDF: https://www.jpmorganchase.com/content/dam/jpmc/jpmorgan-chase-and-co/investor-relations/documents/quarterly-earnings/2025/1st-quarter/d88c408a-bbc9-4b06-b263-373f5b10b145.pdf
✅ Found matching PDF: https://www.jpmorganchase.com/content/dam/jpmc/jpmorgan-chase-and-co/investor-relations/documents/quarterly-earnings/2025/1st-quarter/dea6f825-073f-4b0f-9811-2aaefc5eec94.pdf
✅ Found matching PDF: https://www.jpmorganchase.com/content/dam/jpmc/jpmorgan-chase-and-co/investor-relations/documents/quarterly-earnings/2025/1st-quarter/e243f5ee-ff5b-4608-8ff3-a71eb55dc042.pdf
✅ Found matching PDF: https://www.jpmorganchase.com/content/dam/jpmc/jpmorgan-chase-and-co/investor-relations/documents/quarterly-earnings/2025/1st-quarter/1q25-earnings-transcript.pdf
✅ Found matching PDF: https://www.jpmorganchase.com/content/dam/jpmc/jpmorgan-chase-and-co/investor-relations/documents/quarte

In [7]:
import requests
import os
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Download PDF
def download_pdf(pdf_url, save_path="presentation.pdf"):
    response = requests.get(pdf_url)
    if response.status_code == 200:
        with open(save_path, "wb") as f:
            f.write(response.content)
        print(f"✅ PDF downloaded to {save_path}")
        return save_path
    else:
        raise Exception(f"Failed to download PDF. Status code: {response.status_code}")

# Load and prepare context from PDF
def load_and_prepare_pdf(local_pdf_path):
    loader = PyPDFLoader(local_pdf_path)
    pages = loader.load_and_split()
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    docs = splitter.split_documents(pages)
    return "\n".join([doc.page_content for doc in docs[:5]])

# Delete the file
def delete_file(file_path):
    if os.path.exists(file_path):
        os.remove(file_path)
        print(f"🗑️ Deleted file: {file_path}")
    else:
        print(f"⚠️ File not found: {file_path}")

# DeepSeek API call function
def query_deepseek(prompt, context):
    api_url = "https://openrouter.ai/api/v1/chat/completions"
    headers = {
        "Authorization": "Bearer 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",  # replace with actual key
        "Content-Type": "application/json"
    }
    payload = {
        "model": "deepseek/deepseek-r1-0528:free",
        "messages": [
            {"role": "system", "content": "You are a financial analyst extracting precise numerical data."},
            {"role": "user", "content": f"Context: {context}\n\nQuestion: {prompt}"}
        ],
        "temperature": 0
    }

    response = requests.post(api_url, headers=headers, json=payload)
    return response.json()

# Main logic
if __name__ == "__main__":
    pdf_url = pdf_link  # Replace with actual URL
    local_path = "presentation.pdf"

    try:
        downloaded_path = download_pdf(pdf_url, local_path)
        context = load_and_prepare_pdf(downloaded_path)

        # Run financial queries
        queries = [
            "What is the total revenue reported? Provide exact value only",
            "What is the net income reported? Provide exact value only",
        ]

        for query in queries:
            response = query_deepseek(query, context)
            print(response)
            answer = response['choices'][0]['message']['content']
            print(f"Q: {query}\nA: {answer}\n")

    finally:
        delete_file(local_path)  # Always clean up the file


✅ PDF downloaded to presentation.pdf
{'id': 'gen-1748635261-CuaXoPF5FiXYjOVe7GTT', 'provider': 'Chutes', 'model': 'deepseek/deepseek-r1-0528:free', 'object': 'chat.completion', 'created': 1748635262, 'choices': [{'logprobs': None, 'finish_reason': 'stop', 'native_finish_reason': 'stop', 'index': 0, 'message': {'role': 'assistant', 'content': '$27.4B', 'refusal': None, 'reasoning': 'Alright, let me tackle this user query. The user is asking for the total revenue reported by Bank of America in their 1Q25 financial results. They specifically want the exact value only. \n\nFirst, I recall that in financial contexts like this, "revenue" typically refers to the top-line income before expenses. Scanning the provided data, I spot the "Revenue $27.4B1" under the 1Q25 Highlights section. The superscript 1 indicates it\'s net of interest expense, which aligns with footnote 1\'s definition. \n\nHmm, I should verify if there\'s any ambiguity. The document also mentions net income ($7.4B) and EPS fi