In [1]:
# Cell 1: Setup - Add project root to sys.path and install necessary libraries
# This ensures we can import from the 'src' directory
import sys
import os

# Get the absolute path to the project root directory
# Assuming the notebook is in AI-CFO-FYP/notebooks/
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))

# Add the project root to the system path if it's not already there
if project_root not in sys.path:
    sys.path.insert(0, project_root)

print("Project root added to sys.path:", project_root)
print("Current working directory:", os.getcwd())

# Reminder to ensure libraries are installed
# !pip install -r ../requirements.txt # Uncomment if you need to install all requirements
# You specifically need 'chonkie' and 'pydantic' for this notebook
# !pip install chonkie pydantic

print("\n--- Setup Complete ---")

Project root added to sys.path: c:\Users\wbrya\OneDrive\Documents\GitHub\AI-CFO-FYP
Current working directory: c:\Users\wbrya\OneDrive\Documents\GitHub\AI-CFO-FYP\notebooks

--- Setup Complete ---


In [2]:
# Cell 2: Import necessary modules and classes
import uuid # Standard library for UUIDs
from typing import List, Dict, Any # Standard library for type hinting
from datetime import date # Standard library, potentially useful for date validation later

# Import classes from our src directory structure
# These imports assume the structure: src/services/Sectioner.py, etc.
from src.services.Sectioner import Sectioner, SectionData # Import Sectioner to generate test input
from src.services.ChunkingService import ChunkingService, ChunkData
from src.services.MetadataExtractor import FinancialDocumentMetadata # Pydantic model for metadata
from src.enums import FinancialDocSpecificType, DocType # Enums for metadata types

print("Imported Sectioner, ChunkingService, FinancialDocumentMetadata, Enums.")
print("\n--- Imports Complete ---")

Imported Sectioner, ChunkingService, FinancialDocumentMetadata, Enums.

--- Imports Complete ---


In [3]:
# Cell 3: Prepare Sample Data for ChunkingService Testing
# We need a realistic sample markdown output (like from FinancialDocParser)
# and sample document metadata (like from MetadataExtractor)
# We also need to explicitly define the document_id and user_id used throughout the test

# --- Define Explicit Test IDs ---
# Using specific UUIDs for clarity and reliable comparison in assertions
TEST_DOCUMENT_ID = uuid.uuid4()
TEST_USER_ID = uuid.uuid4()

print(f"Using explicit TEST_DOCUMENT_ID: {TEST_DOCUMENT_ID}")
print(f"Using explicit TEST_USER_ID:     {TEST_USER_ID}")


# --- Sample Markdown Content ---
# Designed to have multiple sections, some potentially larger than the chunk size (128 in test)
# Includes page markers to simulate parser output
sample_markdown_complex = """
--- Page 1 Start ---

# Annual Financial Report 2023 - Acme Corporation

This report presents the audited financial results of Acme Corporation for the fiscal year ended December 31, 2023. It includes the consolidated balance sheets as of December 31, 2023 and 2022, the related consolidated statements of income, comprehensive income, changes in stockholders' equity, and cash flows for each of the three years in the period ended December 31, 2023, and notes to consolidated financial statements. Our performance reflects significant market challenges but also strategic resilience.

--- Page 1 End ---

--- Page 2 Start ---

## Section 1: Management Discussion and Analysis

### Overview

Acme Corporation demonstrated solid financial performance in 2023, navigating a complex global economic landscape. Revenue growth was tempered by currency fluctuations, but operating margins improved due to cost efficiencies. We continued to invest in our core technologies and expand our market presence in key regions. The outlook for 2024 remains cautiously optimistic, focusing on innovation and sustainable growth initiatives. Our strategic priorities align with long-term value creation for our shareholders. Management is confident in our ability to execute against our plans. This section provides a detailed review of our operations and financial condition, offering insights beyond the raw numbers presented in the statements.

--- Page 2 End ---

--- Page 3 Start ---

## Section 2: Consolidated Statements of Income

| Item                       | 2023 ($M) | 2022 ($M) | 2021 ($M) |
|----------------------------|-----------|-----------|-----------|
| Net Sales                  | 1200      | 1040      | 950       |
| Cost of Goods Sold         | 500       | 450       | 420       |
| **Gross Profit**           | **700**   | **590**   | **530**   |
| Operating Expenses         |           |           |           |
|   Research & Development   | 150       | 130       | 120       |
|   Sales & Marketing        | 200       | 180       | 170       |
|   General & Administrative | 100       | 90        | 85        |
| **Total Operating Expenses** | **450**   | **400**   | **375**   |
| Income from Operations     | 250       | 190       | 155       |
| Other Income (Expense)     | 10        | 5         | 8         |
| Interest Expense           | (20)      | (15)      | (12)      |
| **Income Before Taxes**    | **240**   | **180**   | **151**   |
| Income Tax Expense         | 40        | 30        | 25        |
| **Net Income**             | **200**   | **150**   | **126**   |

--- Page 3 End ---

--- Page 4 Start ---

## Section 3: Consolidated Balance Sheets

### Assets

| Item                     | Dec 31, 2023 ($M) | Dec 31, 2022 ($M) |
|--------------------------|-------------------|-------------------|
| Current Assets           |                   |                   |
|   Cash and Equivalents   | 50                | 40                |
|   Accounts Receivable    | 100               | 90                |
|   Inventory              | 120               | 110               |
|   Prepaid Expenses       | 30                | 25                |
| **Total Current Assets** | **300**           | **265**           |
| Non-Current Assets       |                   |                   |
|   Property, Plant & Equip| 500               | 480               |
|   Intangible Assets      | 150               | 160               |
|   Goodwill               | 200               | 200               |
| **Total Non-Current Assets**| **850**           | **840**           |
| **TOTAL ASSETS**         | **1150**          | **1105**          |

### Liabilities and Stockholders' Equity

| Item                         | Dec 31, 2023 ($M) | Dec 31, 2022 ($M) |
|------------------------------|-------------------|-------------------|
| Current Liabilities          |                   |                   |
|   Accounts Payable           | 80                | 75                |
|   Accrued Expenses           | 70                | 65                |
|   Short-term Debt            | 50                | 60                |
| **Total Current Liabilities**| **200**           | **200**           |
| Non-Current Liabilities      |                   |                   |
|   Long-term Debt             | 300               | 320               |
| **Total Non-Current Liabilities**| **300**           | **320**           |
| Stockholders' Equity       |                   |                   |
|   Common Stock             | 100               | 100               |
|   Retained Earnings        | 550               | 485               |
| **Total Stockholders' Equity**| **650**           | **585**           |
| **TOTAL LIABILITIES AND STOCKHOLDERS' EQUITY** | **1150** | **1105** |

--- Page 4 End ---
""" # Total 4 pages, 4 main sections including document start

# --- Use Sectioner to get structured sections from sample markdown ---
print("\nUsing Sectioner to prepare sections from sample markdown...")
sectioner = Sectioner() # Instantiate the Sectioner

# Use the explicit TEST_DOCUMENT_ID and TEST_USER_ID here
sections_output: List[SectionData] = sectioner.section_markdown(
    markdown_content=sample_markdown_complex,
    document_id=TEST_DOCUMENT_ID, # Pass the explicit IDs
    user_id=TEST_USER_ID         # Pass the explicit IDs
)
print(f"Sectioner produced {len(sections_output)} sections.")

# Add dummy IDs to sections_output for ChunkingService input
# The Sectioner code doesn't add 'id', so we add it here manually for the test setup.
# This simulates sections being saved to DB and getting IDs before chunking.
print("\nAdding dummy section IDs to sections data for ChunkingService input...")
for section in sections_output:
    section['id'] = uuid.uuid4()
    print(f"  Section {section['section_index']} '{section['section_heading']}': Added ID {section['id']}")

# --- Manually create sample FinancialDocumentMetadata ---
# Use explicit placeholder values that ChunkingService should handle (None, -1, "1900-01-01")
sample_metadata = FinancialDocumentMetadata(
    doc_specific_type=FinancialDocSpecificType.ANNUAL_REPORT,
    company_name="Acme Corporation", # Use the full name from sample markdown
    report_date="2023-12-31",
    doc_year=2023,
    doc_quarter=-1, # Using -1 as the placeholder for null quarter
    doc_summary="Audited financial results for Acme Corp's fiscal year 2023, including statements and notes.",
    # Assume currency/units are handled in JSONB metadata, not in this Pydantic model
)
print("\nCreated sample document metadata:")
print(sample_metadata.model_dump_json(indent=2))

print("\n--- Sample Data Preparation Complete ---")

Using explicit TEST_DOCUMENT_ID: 1aabb848-9869-494c-ae40-4474b82d4caf
Using explicit TEST_USER_ID:     95e67fd9-ece9-497b-99e8-8fe165d69b22

Using Sectioner to prepare sections from sample markdown...
Initialized Sectioner.
Sectioning markdown content (82 lines)...
Finalized section 0: 'Document Start'
Finalized section 1: 'Annual Financial Report 2023 - Acme Corporation'
Finalized section 2: 'Section 1: Management Discussion and Analysis'
Finalized section 3: 'Overview'
Finalized section 4: 'Section 2: Consolidated Statements of Income'
Finalized section 5: 'Section 3: Consolidated Balance Sheets'
Finalized section 6: 'Assets'
Finalized last section 7: 'Liabilities and Stockholders' Equity'
Sectioning complete. Created 8 sections.
Sectioner produced 8 sections.

Adding dummy section IDs to sections data for ChunkingService input...
  Section 0 'Document Start': Added ID 99c61560-17cd-4122-bf5c-5316009486a2
  Section 1 'Annual Financial Report 2023 - Acme Corporation': Added ID fdd999a

In [4]:
# Cell 4: Test ChunkingService with normal sections and metadata

print("\n--- Testing ChunkingService with sections and metadata ---")
try:
    # Instantiate ChunkingService with desired parameters
    # Using smaller chunk_size to see more splits across larger sections
    chunking_service = ChunkingService(
        chunk_size=128, # Tokens
        min_characters_per_chunk=50 # Increase minimum chars to avoid very small chunks
    )
    print("ChunkingService instantiated.")

    # Run the chunking process using the prepared data and explicit IDs
    all_generated_chunks: List[ChunkData] = chunking_service.chunk_sections(
        sections=sections_output, # Use the output from Sectioner (with added IDs)
        document_metadata=sample_metadata,
        document_id=TEST_DOCUMENT_ID, # Pass the explicit TEST_DOCUMENT_ID
        user_id=TEST_USER_ID         # Pass the explicit TEST_USER_ID
    )

    print(f"\nChunkingService generated a total of {len(all_generated_chunks)} chunks.")

    # --- Thorough Verification of Generated Chunk Data ---
    print("\n--- Verifying and Showing Sample of Generated Chunk Data ---")

    if not all_generated_chunks:
        print("No chunks were generated.")
    else:
        num_chunks_to_show = min(len(all_generated_chunks), 7) # Show up to 7 chunks
        print(f"Showing details for the first {num_chunks_to_show} chunks:")

        for i, chunk in enumerate(all_generated_chunks[:num_chunks_to_show]):
            print(f"\n--- Chunk {i+1} (Index {chunk.get('chunk_index', 'N/A')} in Section {chunk.get('section_index', 'N/A')}) ---")

            # 1. Check presence of required keys
            required_keys = ['chunk_text', 'chunk_index', 'start_char_index', 'end_char_index',
                             'section_id', 'document_id', 'user_id',
                             'doc_specific_type', 'doc_year', 'doc_quarter', 'company_name',
                             'report_date', 'section_heading', 'metadata'] # metadata JSONB column
            for key in required_keys:
                assert key in chunk, f"Chunk {i+1} missing required key: {key}"
            print("  All required keys are present.")

            # 2. Verify IDs are correct and have correct types
            assert chunk['document_id'] == TEST_DOCUMENT_ID, \
                f"Chunk {i+1} Document ID mismatch: Expected {TEST_DOCUMENT_ID}, Got {chunk['document_id']}"
            assert chunk['user_id'] == TEST_USER_ID, \
                f"Chunk {i+1} User ID mismatch: Expected {TEST_USER_ID}, Got {chunk['user_id']}"
            assert isinstance(chunk['section_id'], uuid.UUID), \
                f"Chunk {i+1} Section ID is not a UUID: {chunk['section_id']}"
            # Optional: Check if section_id matches one of the input section IDs - requires more complex test setup
            print("  Document ID and User ID match the test IDs.")
            print("  Section ID is a valid UUID.")


            # 3. Verify Copied Metadata values match sample_metadata (handling None conversions)
            assert chunk['doc_specific_type'] == sample_metadata.doc_specific_type.value, \
                f"Chunk {i+1} doc_specific_type mismatch: Expected {sample_metadata.doc_specific_type.value}, Got {chunk['doc_specific_type']}"
            assert chunk['doc_year'] == sample_metadata.doc_year, \
                 f"Chunk {i+1} doc_year mismatch: Expected {sample_metadata.doc_year}, Got {chunk['doc_year']}"
            # Check for doc_quarter == None if sample_metadata.doc_quarter was -1
            expected_quarter = sample_metadata.doc_quarter if sample_metadata.doc_quarter != -1 else None
            assert chunk['doc_quarter'] == expected_quarter, \
                f"Chunk {i+1} doc_quarter mismatch: Expected {expected_quarter}, Got {chunk['doc_quarter']}"
            assert chunk['company_name'] == sample_metadata.company_name, \
                 f"Chunk {i+1} company_name mismatch: Expected '{sample_metadata.company_name}', Got '{chunk['company_name']}'"
            # Check for report_date == None if sample_metadata.report_date was "1900-01-01"
            expected_report_date = sample_metadata.report_date if sample_metadata.report_date != "1900-01-01" else None
            assert chunk['report_date'] == expected_report_date, \
                f"Chunk {i+1} report_date mismatch: Expected '{expected_report_date}', Got '{chunk['report_date']}'"
            assert chunk['section_heading'] is not None and len(chunk['section_heading']) > 0, \
                 f"Chunk {i+1} section_heading is missing or empty: '{chunk['section_heading']}'"
            print("  Copied metadata values match the sample metadata.")


            # 4. Verify chunk text and position
            assert len(chunk['chunk_text']) > 0, f"Chunk {i+1} has empty chunk_text"
            assert chunk['start_char_index'] >= 0, f"Chunk {i+1} invalid start_char_index: {chunk['start_char_index']}"
            assert chunk['end_char_index'] > chunk['start_char_index'], f"Chunk {i+1} invalid end_char_index: {chunk['end_char_index']}"
            # More advanced check: Verify chunk text matches substring of parent section content
            # This requires retrieving the parent section by section_id from the sections_output list
            parent_section = next((s for s in sections_output if s['id'] == chunk['section_id']), None)
            assert parent_section is not None, f"Chunk {i+1} parent section not found in input list!"
            expected_text_from_section = parent_section['content_markdown'][chunk['start_char_index']:chunk['end_char_index']]
            assert chunk['chunk_text'] == expected_text_from_section, \
                f"Chunk {i+1} text mismatch with parent section content substring!"
            print("  Chunk text and position are consistent with parent section.")

            # 5. Check index
            assert chunk['chunk_index'] >= 0, f"Chunk {i+1} invalid chunk_index: {chunk['chunk_index']}"
            # We expect multiple chunks for larger sections, so indices should increment
            # This is harder to assert generally, but inspecting output confirms it
            print("  Chunk index is valid.")


            # Print some details again after verification
            print(f"\n  Chunk Text Preview: {chunk['chunk_text'][:150]}...")
            print(f"  Section Heading: '{chunk['section_heading']}'")
            print(f"  Position in Section (chars): {chunk['start_char_index']}-{chunk['end_char_index']}")
            print(f"  Parent Section Index: {next((s['section_index'] for s in sections_output if s['id'] == chunk['section_id']), 'N/A')}")

        print(f"\n--- Verification of first {num_chunks_to_show} chunks complete. ---")

    print("\nAll assertions in Cell 4 passed.")

except AssertionError as e:
    print(f"\nTEST FAILED: Assertion Error - {e}")
except Exception as e:
    print(f"\nTEST FAILED: An unexpected error occurred during ChunkingService test - {e}")


--- Testing ChunkingService with sections and metadata ---
Initializing ChunkingService with RecursiveChunker...


  from .autonotebook import tqdm as notebook_tqdm


RecursiveChunker initialized with chunk_size=128, min_chars=50
ChunkingService instantiated.
Chunking 8 sections...
Chunking section 'Document Start' (Index: 0)...
Chunked section 'Document Start' into 1 chunks.
Chunking section 'Annual Financial Report 2023 - Acme Corporation' (Index: 1)...
Chunked section 'Annual Financial Report 2023 - Acme Corporation' into 1 chunks.
Chunking section 'Section 1: Management Discussion and Analysis' (Index: 2)...
Chunked section 'Section 1: Management Discussion and Analysis' into 1 chunks.
Chunking section 'Overview' (Index: 3)...
Chunked section 'Overview' into 2 chunks.
Chunking section 'Section 2: Consolidated Statements of Income' (Index: 4)...
Chunked section 'Section 2: Consolidated Statements of Income' into 7 chunks.
Chunking section 'Section 3: Consolidated Balance Sheets' (Index: 5)...
Chunked section 'Section 3: Consolidated Balance Sheets' into 1 chunks.
Chunking section 'Assets' (Index: 6)...
Chunked section 'Assets' into 7 chunks.
Chun

In [5]:
# Cell 5: Test ChunkingService with an empty sections list

print("\n--- Testing ChunkingService with empty sections list ---")
try:
    chunking_service_empty = ChunkingService(chunk_size=128, min_characters_per_chunk=50)
    empty_sections: List[SectionData] = [] # Empty list of sections

    # Reuse explicit IDs and metadata for consistency
    chunks_from_empty = chunking_service_empty.chunk_sections(
        sections=empty_sections,
        document_metadata=sample_metadata,
        document_id=TEST_DOCUMENT_ID,
        user_id=TEST_USER_ID
    )

    print(f"\nChunkingService generated {len(chunks_from_empty)} chunks from empty input.")
    assert len(chunks_from_empty) == 0, f"Expected 0 chunks, but got {len(chunks_from_empty)}"

    print("Test passed: Correctly returned 0 chunks for empty sections list.")

except Exception as e:
     print(f"\nTEST FAILED: An error occurred during empty sections test: {e}")


--- Testing ChunkingService with empty sections list ---
Initializing ChunkingService with RecursiveChunker...
RecursiveChunker initialized with chunk_size=128, min_chars=50
Chunking 0 sections...
ChunkingService completed. Total chunks generated: 0

ChunkingService generated 0 chunks from empty input.
Test passed: Correctly returned 0 chunks for empty sections list.


In [6]:
# Cell 6: Test ChunkingService with a section that has empty content

print("\n--- Testing ChunkingService with a section having empty content ---")
try:
    chunking_service_empty_content = ChunkingService(chunk_size=128, min_characters_per_chunk=50)

    # Create a dummy section with content that is just whitespace or empty after stripping
    # Use explicit IDs for consistency
    section_with_empty_content: SectionData = {
        "id": uuid.uuid4(), # Needs a dummy ID
        "document_id": TEST_DOCUMENT_ID,
        "user_id": TEST_USER_ID,
        "section_heading": "Empty Content Section",
        "page_numbers": [10],
        "content_markdown": "   \n\n# Heading in whitespace \n", # Content that should become empty after strip
        "section_index": 99,
    }

    sections_with_one_empty = [section_with_empty_content]

    chunks_from_empty_content = chunking_service_empty_content.chunk_sections(
        sections=sections_with_one_empty,
        document_metadata=sample_metadata,
        document_id=TEST_DOCUMENT_ID,
        user_id=TEST_USER_ID
    )

    print(f"\nChunkingService generated {len(chunks_from_empty_content)} chunks from section with empty content.")
    # Expect 0 chunks because the ChunkingService should skip sections with strip() returning empty string
    assert len(chunks_from_empty_content) == 0, f"Expected 0 chunks, but got {len(chunks_from_empty_content)}"

    print("Test passed: Correctly skipped chunking for section with empty content.")

except Exception as e:
     print(f"\nTEST FAILED: An error occurred during empty content section test: {e}")


--- Testing ChunkingService with a section having empty content ---
Initializing ChunkingService with RecursiveChunker...
RecursiveChunker initialized with chunk_size=128, min_chars=50
Chunking 1 sections...
Chunking section 'Empty Content Section' (Index: 99)...
Chunked section 'Empty Content Section' into 1 chunks.
ChunkingService completed. Total chunks generated: 1

ChunkingService generated 1 chunks from section with empty content.

TEST FAILED: An error occurred during empty content section test: Expected 0 chunks, but got 1
