In [1]:
# Make the parent of the current working directory importable, so that the
# `src` package can be imported by the cells that follow.
import os
import sys

notebook_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(notebook_dir, ".."))

# Guard the insert so re-running this cell does not add duplicate entries.
already_on_path = project_root in sys.path
if not already_on_path:
    sys.path.insert(0, project_root)

print("Project root in sys.path:", project_root)
print("Current working directory:", notebook_dir)

Project root in sys.path: c:\Users\wbrya\OneDrive\Documents\GitHub\AI-CFO-FYP
Current working directory: c:\Users\wbrya\OneDrive\Documents\GitHub\AI-CFO-FYP\notebooks


In [2]:
# Cell 2: Import necessary modules and classes
# The `src.*` imports below rely on the sys.path change made in the first cell,
# so that cell has to run before this one.
import uuid # Standard library; used to create dummy document/user/section IDs
from datetime import date # For testing date conversion later if needed (not referenced in the cells visible here)

# Import classes from our src directory
from src.services.Sectioner import Sectioner, SectionData
from src.services.ChunkingService import ChunkingService, ChunkData
from src.services.MetadataExtractor import FinancialDocumentMetadata # Pydantic model
from src.enums import FinancialDocSpecificType, DocType # Enums (DocType is not referenced in the cells visible here)

# Confirmation message so the cell output shows that the imports succeeded
print("Imported Sectioner, ChunkingService, FinancialDocumentMetadata, Enums")

Imported Sectioner, ChunkingService, FinancialDocumentMetadata, Enums


In [3]:
# Cell 3: Prepare Sample Data
# The ChunkingService test needs two inputs:
#   * sample markdown output (like the output of FinancialDocParser), and
#   * sample document metadata (like the output of MetadataExtractor).

# --- Sample Markdown (similar to the previous test) ---
# `List` is only used for type annotations in this cell and in the Cell 4 test.
from typing import List


# Fixture text: six "--- Page N Start/End ---" marker pairs, headings at levels
# 1-3, a markdown table, bullet lists, a heading placed between pages
# ("Empty Section Test") and a final page with no real content.
# NOTE: the indented lines beginning with '#' inside the triple-quoted string are
# part of the string's text, not Python comments.
sample_markdown = """
--- Page 1 Start ---

# Annual Report 2023

This is the introductory text for the annual report of Acme Corp. It covers the period from
January 1, 2023 to December 31, 2023.

--- Page 1 End ---

--- Page 2 Start ---

## Section 1: Financial Highlights

Key figures for the year:
Revenue: $1.2B
Net Income: $200M

--- Page 2 End ---

--- Page 3 Start ---

## Section 2: Income Statement

Detailed revenue analysis:

| Item          | 2023 ($M) | 2022 ($M) |
|---------------|-----------|-----------|
| Product Sales | 800       | 700       |
| Service Fees  | 400       | 340       |
| **Total**     | **1200**  | **1040**  |

### Operating Expenses

Breakdown of costs.

--- Page 3 End ---

--- Page 4 Start ---

Detailed expense breakdown:
- Research & Development: $150M
- Marketing: $200M
- General & Administrative: $100M

--- Page 4 End ---

--- Page 5 Start ---

## Section 3: Balance Sheet

Assets and Liabilities.

### Assets

Current Assets:
- Cash: $50M
- Accounts Receivable: $100M

--- Page 5 End ---

## Empty Section Test
    # This section has no actual content

--- Page 6 Start ---
    # Page 6 only has the marker and nothing else
--- Page 6 End ---
"""

# --- Run the Sectioner over the sample markdown ---
# The ChunkingService takes Sectioner output as its input, so the sections
# have to be produced first.
print("Using Sectioner to prepare sections from sample markdown...")
sectioner = Sectioner()

# Throwaway identifiers standing in for a document and the user who owns it
test_document_id, test_user_id = uuid.uuid4(), uuid.uuid4()

# Split the sample markdown into a list of section records
sections_output: List[SectionData] = sectioner.section_markdown(
    markdown_content=sample_markdown,
    document_id=test_document_id,
    user_id=test_user_id,
)
print(f"Sectioner produced {len(sections_output)} sections.")

# --- Hand-built FinancialDocumentMetadata ---
# Stands in for what MetadataExtractor would return for this document.
metadata_fields = {
    "doc_specific_type": FinancialDocSpecificType.ANNUAL_REPORT,
    "company_name": "Acme Corp",
    "report_date": "2023-12-31",  # kept as a string, as per the Pydantic model
    "doc_year": 2023,
    "doc_quarter": -1,  # -1 is the placeholder used for a null quarter
    "doc_summary": "Summary of Acme Corp's 2023 annual performance.",
}
sample_metadata = FinancialDocumentMetadata(**metadata_fields)

print("\nCreated sample document metadata:")
# model_dump_json is the Pydantic v2+ serializer
print(sample_metadata.model_dump_json(indent=2))

# IMPORTANT: give every section an 'id' before it is handed to ChunkingService.
# The ChunkingService expects a section ID to be present for FK linking.
# In the real pipeline the sections would be saved to the DB *before* chunking
# and their DB-generated IDs would be used; this test simulates that step.
# The Sectioner output does not carry an 'id' key, so one is attached here.
print("\nAdding dummy section IDs to sections data for ChunkingService input...")
for section in sections_output:
    dummy_section_id = uuid.uuid4()
    section['id'] = dummy_section_id
    print(f"  Section {section['section_index']} '{section['section_heading']}': Added ID {section['id']}")

Using Sectioner to prepare sections from sample markdown...
Initialized Sectioner.
Sectioning markdown content (67 lines)...
Finalized section 0: 'Document Start'
Finalized section 1: 'Annual Report 2023'
Finalized section 2: 'Section 1: Financial Highlights'
Finalized section 3: 'Section 2: Income Statement'
Finalized section 4: 'Operating Expenses'
Finalized section 5: 'Section 3: Balance Sheet'
Finalized section 6: 'Assets'
Finalized last section 7: 'Empty Section Test'
Sectioning complete. Created 8 sections.
Sectioner produced 8 sections.

Created sample document metadata:
{
  "doc_specific_type": "Annual Report",
  "company_name": "Acme Corp",
  "report_date": "2023-12-31",
  "doc_year": 2023,
  "doc_quarter": -1,
  "doc_summary": "Summary of Acme Corp's 2023 annual performance."
}

Adding dummy section IDs to sections data for ChunkingService input...
  Section 0 'Document Start': Added ID 8fd82d7d-3125-4c0c-9053-ff20321ab026
  Section 1 'Annual Report 2023': Added ID 3d04b1be-5

In [4]:
# Cell 4: Test ChunkingService with normal sections and metadata
#
# Chunks the sections prepared in Cell 3, prints a preview of the first few
# chunks, and asserts that the document-level metadata was copied onto EVERY
# chunk (not only the previewed ones).
# Requires `sections_output`, `sample_metadata`, `test_document_id` and
# `test_user_id` from Cell 3.
# Any failure (including a failed assertion) is reported and then re-raised so
# the cell visibly fails instead of looking like a successful run.

print("\n--- Testing ChunkingService with normal sections and metadata ---")
try:
    # Instantiate ChunkingService with desired parameters
    # Using smaller chunk_size for this test to see more chunks
    chunking_service = ChunkingService(
        chunk_size=128, # Smaller chunk size to see more splits
        min_characters_per_chunk=24
    )

    # Run the chunking process
    all_generated_chunks: List[ChunkData] = chunking_service.chunk_sections(
        sections=sections_output, # Use the output from Sectioner (with added IDs)
        document_metadata=sample_metadata,
        document_id=test_document_id,
        user_id=test_user_id
    )

    print(f"\nChunkingService generated a total of {len(all_generated_chunks)} chunks.")

    # --- Verify and Print Chunk Data ---
    if all_generated_chunks:
        print("\n--- Sample of Generated Chunk Data ---")
        # Only the first few chunks are printed, but all of them are verified.
        num_chunks_to_show = 5
        for i, chunk in enumerate(all_generated_chunks):
            if i < num_chunks_to_show:
                print(f"\nChunk {i+1}:")
                print(f"  Chunk Text Preview: {chunk['chunk_text'][:150]}...")
                print(f"  Chunk Length: {len(chunk['chunk_text'])} chars")
                print(f"  Chunk Index (within section): {chunk['chunk_index']}")
                print(f"  Section ID: {chunk['section_id']}")
                print(f"  Document ID: {chunk['document_id']}")
                print(f"  User ID: {chunk['user_id']}")
                print("  Copied Metadata:")
                print(f"    doc_specific_type: {chunk['doc_specific_type']}")
                print(f"    doc_year: {chunk['doc_year']}")
                print(f"    doc_quarter: {chunk['doc_quarter']}")
                print(f"    company_name: {chunk['company_name']}")
                print(f"    report_date: {chunk['report_date']}")
                print(f"    section_heading: {chunk['section_heading']}")

            # Verify data types/values for copied metadata (every chunk).
            # Messages carry the chunk position because a bare AssertionError
            # has an empty string representation.
            assert chunk['document_id'] == test_document_id, f"chunk {i}: document_id mismatch"
            assert chunk['user_id'] == test_user_id, f"chunk {i}: user_id mismatch"
            assert isinstance(chunk['section_id'], uuid.UUID), f"chunk {i}: section_id is not a UUID"
            assert chunk['doc_specific_type'] == sample_metadata.doc_specific_type.value, (
                f"chunk {i}: doc_specific_type mismatch"
            )
            assert chunk['doc_year'] == sample_metadata.doc_year, f"chunk {i}: doc_year mismatch"
            assert chunk['doc_quarter'] == sample_metadata.doc_quarter, f"chunk {i}: doc_quarter mismatch"
            assert chunk['company_name'] == sample_metadata.company_name, f"chunk {i}: company_name mismatch"
            # Note: report_date is compared as a string
            assert chunk['report_date'] == sample_metadata.report_date, f"chunk {i}: report_date mismatch"
            assert chunk['section_heading'] is not None, f"chunk {i}: section_heading is missing"

        # Report how many chunks were actually previewed (may be fewer than the limit).
        num_chunks_shown = min(num_chunks_to_show, len(all_generated_chunks))
        print(f"\n...Showing first {num_chunks_shown} of {len(all_generated_chunks)} chunks.")
        print("\nVerification Points:")
        print(f"- Total chunks generated: {len(all_generated_chunks)} (Should be > num sections if sections are large)")
        print("- Each chunk should have correct document_id and user_id.")
        print("- Each chunk should have a unique section_id matching one of the input sections.")
        print("- Copied metadata fields should match the sample_metadata.")
        print("- chunk_text should be parts of the original section content.")
        print("- chunk_index, start_char_index, end_char_index should reflect position within the section.")

    else:
        print("ChunkingService returned an empty list of chunks.")

except Exception as e:
    # AssertionError is an Exception subclass, so failed checks land here too.
    # Report with the exception type, then re-raise so the failure is not hidden.
    print(f"\nAn error occurred during ChunkingService test: {type(e).__name__}: {e}")
    raise


--- Testing ChunkingService with normal sections and metadata ---
Initializing ChunkingService with RecursiveChunker...


  from .autonotebook import tqdm as notebook_tqdm


RecursiveChunker initialized with chunk_size=128, min_chars=24
Chunking 8 sections...
Chunking section 'Document Start' (Index: 0)...
Chunked section 'Document Start' into 1 chunks.
Chunking section 'Annual Report 2023' (Index: 1)...
Chunked section 'Annual Report 2023' into 1 chunks.
Chunking section 'Section 1: Financial Highlights' (Index: 2)...
Chunked section 'Section 1: Financial Highlights' into 1 chunks.
Chunking section 'Section 2: Income Statement' (Index: 3)...
Chunked section 'Section 2: Income Statement' into 1 chunks.
Chunking section 'Operating Expenses' (Index: 4)...
Chunked section 'Operating Expenses' into 1 chunks.
Chunking section 'Section 3: Balance Sheet' (Index: 5)...
Chunked section 'Section 3: Balance Sheet' into 1 chunks.
Chunking section 'Assets' (Index: 6)...
Chunked section 'Assets' into 1 chunks.
Chunking section 'Empty Section Test' (Index: 7)...
Chunked section 'Empty Section Test' into 1 chunks.
ChunkingService completed. Total chunks generated: 8

Chun

In [5]:
# Cell 5: Test ChunkingService with a section that has empty content
#
# Feeds ChunkingService a single section whose markdown is whitespace only and
# asserts that no chunks come back.
# Requires `sample_metadata`, `test_document_id` and `test_user_id` from Cell 3.
# Any failure (including a failed assertion) is reported and then re-raised so
# the cell visibly fails instead of looking like a successful run.

print("\n--- Testing ChunkingService with a section having empty content ---")
try:
    chunking_service_empty_content = ChunkingService(chunk_size=128)

    # Create a dummy section with empty content markdown
    section_with_empty_content: SectionData = {
        "id": uuid.uuid4(), # Needs a dummy ID
        "document_id": test_document_id,
        "user_id": test_user_id,
        "section_heading": "Empty Content Section",
        "page_numbers": [10],
        "content_markdown": "   \n\n", # Whitespace/empty content
        "section_index": 99,
    }

    sections_with_one_empty = [section_with_empty_content]

    chunks_from_empty_content = chunking_service_empty_content.chunk_sections(
        sections=sections_with_one_empty,
        document_metadata=sample_metadata,
        document_id=test_document_id,
        user_id=test_user_id
    )

    print(f"\nChunkingService generated {len(chunks_from_empty_content)} chunks from section with empty content.")

    # The section should be skipped entirely. The failure text lives on the
    # assert itself; a separate else-branch after the assert could never run.
    assert len(chunks_from_empty_content) == 0, (
        "Test failed: Returned chunks for section with empty content."
    )
    print("Test passed: Correctly skipped chunking for section with empty content.")

except Exception as e:
    # AssertionError is an Exception subclass, so a failed check lands here too.
    # Report with the exception type, then re-raise so the failure is not hidden.
    print(f"\nAn error occurred during empty content section test: {type(e).__name__}: {e}")
    raise


--- Testing ChunkingService with a section having empty content ---
Initializing ChunkingService with RecursiveChunker...
RecursiveChunker initialized with chunk_size=128, min_chars=24
Chunking 1 sections...
Section 'Empty Content Section' (99) has no content, skipping chunking.
ChunkingService completed. Total chunks generated: 0

ChunkingService generated 0 chunks from section with empty content.
Test passed: Correctly skipped chunking for section with empty content.
