In [None]:
# Bootstrap imports: expose the directory one level above the kernel's
# working directory to the import system so that `src` can be imported.
import os
import sys

working_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(working_dir, ".."))

# Prepend at most once so that re-running this cell does not stack duplicates.
already_registered = project_root in sys.path
if not already_registered:
    sys.path.insert(0, project_root)

print("Project root in sys.path:", project_root)
print("Current working directory:", working_dir)

In [None]:
# Cell 5: Import necessary classes and modules
import os
from dotenv import load_dotenv
import io # Import the io module for BytesIO

# Assuming your src directory is installable with __init__.py files
from src.llm.GeminiClient import GeminiClient
from src.services.FinancialDocParser import FinancialDocParser

# Load environment variables again (good practice in notebooks) so that the
# GEMINI_API_KEY lookup below can see values supplied through a .env file.
# NOTE(review): python-dotenv's load_dotenv() is documented not to override
# variables that are already set unless override=True is passed — confirm that
# this is the intended precedence.
load_dotenv()

def describe_gemini_key(api_key):
    """Return a status line for the Gemini API key without revealing it.

    Notebook output is stored in the .ipynb file, so nothing derived from the
    key's characters may be printed. Only presence and length are reported.

    Args:
        api_key: The value read from the environment, or None / "" when unset.

    Returns:
        A human-readable message saying whether a key is available.
    """
    if not api_key:
        return "Gemini API Key not loaded. FinancialDocParser will likely fail."
    return f"Gemini API Key loaded ({len(api_key)} characters; value hidden)."


# Verify Gemini Key (Crucial for FinancialDocParser)
gemini_key = os.environ.get("GEMINI_API_KEY")
print(describe_gemini_key(gemini_key))
# Consider stopping here if the key is missing

In [None]:
# Cell 6: Define PDF path and test FinancialDocParser

# --- Configuration ---
# Make sure this path points to your test PDF
TEST_PDF_PATH = "../data/source_pdfs/10k_tesla_3_pages.pdf" # Adjust filename as needed

# Maximum number of markdown characters echoed as a preview in this cell
PREVIEW_CHAR_LIMIT = 1000

# --- End Configuration ---

# Start every run from a known state: the next cell reads `markdown_content`,
# so it must exist (and must not hold text left over from an earlier run) even
# when the PDF is missing or parsing fails.
markdown_content = ""

# Check if the test PDF file exists
if not os.path.exists(TEST_PDF_PATH):
    print(f"Error: Test PDF not found at {TEST_PDF_PATH}")
    print("Please update TEST_PDF_PATH to point to a valid PDF file in your data folder.")
else:
    print(f"Found test PDF: {TEST_PDF_PATH}")
    try:
        # Instantiate GeminiClient (required by FinancialDocParser)
        gemini_client = GeminiClient()

        # Instantiate FinancialDocParser
        doc_parser = FinancialDocParser(gemini_client=gemini_client)

        # Read the whole PDF into an in-memory buffer; the file handle is
        # closed as soon as the `with` block exits.
        with open(TEST_PDF_PATH, 'rb') as f:
            pdf_buffer = io.BytesIO(f.read())

        print(f"\nProcessing PDF: {TEST_PDF_PATH}")

        # Run the parser on the PDF buffer
        parsing_result = doc_parser.parse_pdf_to_markdown(pdf_buffer)

        # Print the results
        print("\n--- Parsing Result ---")
        if parsing_result.get("error"):
            print(f"Parsing failed: {parsing_result['error']}")
        else:
            print(f"Parsing successful. Pages processed: {parsing_result.get('page_count')}\n\n")
            # `or ''` also covers a result whose 'markdown_content' is present
            # but None, which would otherwise break len() and slicing below.
            markdown_content = parsing_result.get('markdown_content') or ''
            # Print only a preview of the markdown to avoid flooding output
            is_truncated = len(markdown_content) > PREVIEW_CHAR_LIMIT
            print(markdown_content[:PREVIEW_CHAR_LIMIT] + ('...' if is_truncated else ''))
            if is_truncated:
                print(f"(Full markdown content is {len(markdown_content)} characters long)")

    except ValueError as e:
        print(f"Configuration Error: {e}") # e.g., API key missing
    except Exception as e:
        print(f"An unexpected error occurred during PDF parsing test: {e}")

In [None]:
from IPython.display import display, Markdown, Latex
# `markdown_content` is only produced when the parsing cell succeeds. Look it
# up defensively so that a missing PDF or a failed parse yields a clear
# message here instead of a NameError / TypeError.
try:
    full_markdown = markdown_content or ""
except NameError:
    full_markdown = ""

if full_markdown:
    print(f"\n(Full markdown content is {len(full_markdown)} characters long)\n\n")
    # Render the parser output as formatted Markdown rather than raw text.
    display(Markdown(full_markdown))
else:
    print("No markdown content to display. Run the parsing cell successfully first.")
