In [1]:
# Cell 1: Setup - Add project root to sys.path and install necessary libraries
# This ensures we can import from the 'src' directory if needed later
import sys
import os

# Resolve the parent of the current working directory as an absolute path.
# The recorded run was started from <project>/notebooks, so the parent is the project root.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
# Prepend it only once so re-running this cell does not keep growing sys.path.
if sys.path.count(project_root) == 0:
    sys.path.insert(0, project_root)
print("Project root added to sys.path:", project_root)
print("Current working directory:", os.getcwd())

# Third-party packages this notebook relies on (install them in the kernel's environment):
#   supabase, python-dotenv, pydantic (pydantic is needed for the metadata model)
print("\n--- Setup Complete ---")

Project root added to sys.path: c:\Users\wbrya\OneDrive\Documents\GitHub\AI-CFO-FYP
Current working directory: c:\Users\wbrya\OneDrive\Documents\GitHub\AI-CFO-FYP\notebooks

--- Setup Complete ---


In [2]:
# Cell 2: Import necessary modules and classes
import os
import io
import uuid
from datetime import date # Needed if testing date fields directly
from dotenv import load_dotenv
from supabase import create_client, Client # Supabase client library

# Import the service and data structures we need to test/simulate
from src.services.SupabaseService import SupabaseService
from src.services.MetadataExtractor import FinancialDocumentMetadata # Input for save_document_record
from src.enums import FinancialDocSpecificType # Enum for metadata
# Type hints for simulated data
from typing import List, Dict, Any, IO
from src.services.Sectioner import SectionData
from src.services.ChunkingService import ChunkData


# Confirm that every import above resolved; emitted as a single call, one message per line.
print("Imported necessary modules.", "\n--- Imports Complete ---", sep="\n")

Imported necessary modules.

--- Imports Complete ---


In [3]:
# Cell 3: Load Environment Variables and Initialize Authenticated Supabase Client
#
# Outcome of this cell:
#   supabase_client  -> authenticated Client, or None if anything went wrong
#   supabase_service -> SupabaseService wrapping that client, or None
# Later cells check `supabase_service` before doing any work.
load_dotenv()  # Load variables from the .env file located by python-dotenv

# Get Supabase credentials from environment variables
supabase_url = os.environ.get("SUPABASE_URL")
supabase_key = os.environ.get("SUPABASE_ANON_KEY")  # Use ANON key

# --- Test User Credentials (MUST belong to the user identified by TEST_USER_UID below) ---
test_email = os.environ.get("TEST_EMAIL")
test_password = os.environ.get("TEST_PASSWORD")
# --- Target User UID (the user identified by test_email MUST have this UID) ---
# Can be overridden with a TEST_USER_UID environment variable; the default is the
# UID this notebook was originally written against.
TEST_USER_UID_STR = os.environ.get("TEST_USER_UID", "e222921f-cfdc-4a05-8cf2-aea13004bcf2")
TEST_USER_UID = uuid.UUID(TEST_USER_UID_STR)  # Convert to UUID object (raises ValueError if malformed)

print(f"SUPABASE_URL Loaded: {'Yes' if supabase_url else 'No'}")
print(f"SUPABASE_ANON_KEY Loaded: {'Yes' if supabase_key else 'No'}")
print(f"TEST_EMAIL Loaded: {'Yes' if test_email else 'No'}")
print(f"TEST_PASSWORD Loaded: {'Yes' if test_password else 'No'}")
print(f"Target TEST_USER_UID: {TEST_USER_UID}")

# Declared once here; it is only left non-None when sign-in succeeded for the expected user.
supabase_client: Client | None = None

# Basic validation
if not all([supabase_url, supabase_key, test_email, test_password]):
    print("\nError: Missing required Supabase credentials or test user info in .env file.")
else:
    try:
        # Initialize the Supabase client
        print("\nInitializing Supabase client...")
        supabase_client = create_client(supabase_url, supabase_key)
        print("Supabase client initialized.")

        # --- Authenticate the client ---
        # Mask the address so the saved notebook output does not contain the full e-mail.
        email_local, email_sep, email_domain = test_email.partition("@")
        masked_email = f"{email_local[:1]}***@{email_domain}" if email_sep else "***"
        print(f"\nAttempting to sign in user: {masked_email}...")
        response = supabase_client.auth.sign_in_with_password(
            {"email": test_email, "password": test_password}
        )

        if response and response.session and response.user:
            print(f"Sign-in successful for user ID: {response.user.id}")
            # Verify the logged-in user's ID matches the target TEST_USER_UID.
            # Compare against the canonical string form of the UUID so an upper-case or
            # brace-wrapped TEST_USER_UID value still matches.
            if str(response.user.id) != str(TEST_USER_UID):
                print(f"CRITICAL ERROR: Logged-in user ID ({response.user.id}) does NOT match target ({TEST_USER_UID}).")
                print("Tests will likely fail due to RLS policies.")
                supabase_client = None  # Invalidate client if user mismatch
            else:
                print("Logged-in user ID matches target TEST_USER_UID.")
        else:
            print("Sign-in failed. Response:", response)
            supabase_client = None

    except Exception as e:
        # Best-effort: report the problem and leave the client unset so later cells skip.
        print(f"\nError initializing Supabase client or signing in: {e}")
        supabase_client = None

print("\n--- Client Initialization and Authentication Complete ---")

# Instantiate the service to test, passing the authenticated client
supabase_service: SupabaseService | None = None
if supabase_client:
    try:
        supabase_service = SupabaseService(supabase_client=supabase_client)
        print("SupabaseService instantiated successfully.")
    except Exception as e:
        print(f"Error instantiating SupabaseService: {e}")
else:
    print("Cannot instantiate SupabaseService because client initialization/authentication failed.")

SUPABASE_URL Loaded: Yes
SUPABASE_ANON_KEY Loaded: Yes
TEST_EMAIL Loaded: Yes
TEST_PASSWORD Loaded: Yes
Target TEST_USER_UID: e222921f-cfdc-4a05-8cf2-aea13004bcf2

Initializing Supabase client...
Supabase client initialized.

Attempting to sign in user: w***@gmail.com...
Sign-in successful for user ID: e222921f-cfdc-4a05-8cf2-aea13004bcf2
Logged-in user ID matches target TEST_USER_UID.

--- Client Initialization and Authentication Complete ---
SupabaseService initialized with provided client.
SupabaseService instantiated successfully.


In [4]:
# Cell 4: Prepare Test Data (Document Metadata, Sections, Chunks)
#
# Builds the in-memory fixtures used by the service tests:
#   TEST_PDF_FILENAME / test_pdf_buffer -> input for the storage upload
#   test_metadata                       -> input for save_document_record
#   test_sections                       -> input for save_sections_batch
#   test_chunks                         -> input for save_chunks_batch
# The document/section/user IDs are not known yet; they are filled in by the test cell.

# Ensure we proceed only if SupabaseService is ready
if supabase_service:
    print("\n--- Preparing Test Data ---")

    # --- 1. Test PDF Data ---
    # We need a filename and a buffer for the upload test
    TEST_PDF_FILENAME = "test_invoice_for_service.pdf"
    # Create a dummy file buffer in memory for testing upload
    # In a real scenario, this buffer comes from the uploaded file
    dummy_pdf_content = b"%PDF-1.4\n1 0 obj<</Type/Catalog/Pages 2 0 R>>endobj 2 0 obj<</Type/Pages/Count 1/Kids[3 0 R]>>endobj 3 0 obj<</Type/Page/MediaBox[0 0 612 792]/Parent 2 0 R/Resources<<>>>>endobj\nxref\n0 4\n0000000000 65535 f \n0000000010 00000 n \n0000000058 00000 n \n0000000111 00000 n \ntrailer<</Size 4/Root 1 0 R>>startxref\n178\n%%EOF"
    test_pdf_buffer = io.BytesIO(dummy_pdf_content)
    print(f"Prepared dummy PDF buffer for filename: {TEST_PDF_FILENAME}")

    # --- 2. Test Document Metadata ---
    test_metadata = FinancialDocumentMetadata(
        doc_specific_type=FinancialDocSpecificType.INVOICE,
        company_name="Test Service Co.",
        report_date="2024-01-15",
        doc_year=2024,
        doc_quarter=-1,  # Placeholder for null
        doc_summary="Invoice for services rendered in January 2024."
    )
    print("Prepared sample FinancialDocumentMetadata.")

    # --- 3. Test Section Data (Simulating Sectioner Output) ---
    # We'll need document_id from the save_document_record step later
    # For now, just create the structure
    test_sections: List[SectionData] = [
        {
            "section_heading": "Invoice Header",
            "page_numbers": [1],
            "content_markdown": "# Invoice\n To: Client A\n From: Test Service Co.",
            "section_index": 0,
            # 'document_id' and 'user_id' will be added before saving
        },
        {
            "section_heading": "Line Items",
            "page_numbers": [1],
            "content_markdown": "## Services Rendered\n - Service 1: $100\n - Service 2: $200",
            "section_index": 1,
        }
    ]
    print(f"Prepared {len(test_sections)} sample sections.")

    # --- 4. Test Chunk Data (Simulating Embedding Output) ---
    # We'll need section_id from the save_sections_batch step later
    # Create chunks with dummy embeddings.
    # start_char_index / end_char_index are offsets into the parent section's
    # content_markdown (end exclusive), so markdown[start:end] == chunk_text.
    test_chunks: List[ChunkData] = [
        {
            # 'section_id', 'document_id', 'user_id' will be added
            "chunk_text": "To: Client A",
            "chunk_index": 0,
            "start_char_index": 11,
            "end_char_index": 23,
            "embedding": [0.1] * 1536,  # Dummy embedding vector
            "embedding_model": "text-embedding-3-small",
            "doc_specific_type": "Invoice",
            "doc_year": 2024,
            "doc_quarter": None,  # Correctly None after conversion
            "company_name": "Test Service Co.",
            "report_date": "2024-01-15",
            "section_heading": "Invoice Header",
            "metadata": {}
        },
        {
            "chunk_text": "- Service 1: $100",
            "chunk_index": 0,
            # "## Services Rendered" is 20 chars, then "\n" and a space -> text starts at 22
            "start_char_index": 22,
            "end_char_index": 39,
            "embedding": [0.2] * 1536,  # Different dummy vector
            "embedding_model": "text-embedding-3-small",
            "doc_specific_type": "Invoice",
            "doc_year": 2024,
            "doc_quarter": None,
            "company_name": "Test Service Co.",
            "report_date": "2024-01-15",
            "section_heading": "Line Items",
            "metadata": {}
        }
    ]

    # Sanity-check the fixtures: every chunk's offsets must slice its text out of the
    # markdown of the section it names, otherwise the test data is self-contradictory.
    section_markdown_by_heading = {
        s["section_heading"]: s["content_markdown"] for s in test_sections
    }
    assert all(
        section_markdown_by_heading[c["section_heading"]][c["start_char_index"]:c["end_char_index"]]
        == c["chunk_text"]
        for c in test_chunks
    ), "Fixture error: chunk offsets do not match chunk_text within the parent section."

    print(f"Prepared {len(test_chunks)} sample chunks with dummy embeddings.")
    print("\n--- Test Data Preparation Complete ---")

else:
    print("\nSkipping Test Data Preparation as SupabaseService is not available.")


--- Preparing Test Data ---
Prepared dummy PDF buffer for filename: test_invoice_for_service.pdf
Prepared sample FinancialDocumentMetadata.
Prepared 2 sample sections.
Prepared 2 sample chunks with dummy embeddings.

--- Test Data Preparation Complete ---


In [5]:
# Cell 5: Test SupabaseService Methods Sequentially
#
# Runs the service methods in pipeline order (upload -> document -> sections -> chunks ->
# status update). Each step asserts on the value returned by the service; a failed
# assertion stops the cell, so later steps only run when the earlier ones passed.

# Use these variables to store IDs generated during the test
# (the cleanup cell reads test_generated_document_id and test_storage_path).
test_generated_document_id: uuid.UUID | None = None
test_storage_path: str | None = None
test_generated_section_ids: List[uuid.UUID] | None = None

if supabase_service:
    print("\n--- Testing SupabaseService Methods ---")

    # === Test 1: Upload PDF ===
    print("\n--- Test 1: upload_pdf_to_storage ---")
    # Need a temporary document ID for the path, will use the final one later
    temp_doc_id_for_path = uuid.uuid4()
    # Rewind the shared in-memory buffer so that re-running this cell uploads the whole
    # file again even if a previous run left the read position at the end.
    test_pdf_buffer.seek(0)
    test_storage_path = supabase_service.upload_pdf_to_storage(
        pdf_file_buffer=test_pdf_buffer,
        user_id=TEST_USER_UID,
        document_id=temp_doc_id_for_path,  # Use temp ID for initial upload path test
        original_filename=TEST_PDF_FILENAME
    )
    assert test_storage_path is not None, "Test 1 FAILED: PDF upload returned None."
    assert str(TEST_USER_UID) in test_storage_path, "Test 1 FAILED: User UID not in storage path."
    assert str(temp_doc_id_for_path) in test_storage_path, "Test 1 FAILED: Document ID not in storage path."
    assert TEST_PDF_FILENAME in test_storage_path, "Test 1 FAILED: Filename not in storage path."
    print(f"Test 1 PASSED: PDF upload seems successful. Path: {test_storage_path}")
    # Note: We use a temp ID here because the actual doc ID isn't known until after DB insert.
    # In the real pipeline, you might insert the document record first (status='uploading'),
    # get the ID, then upload using that ID in the path. Or upload first, then update path later.
    # For simplicity here, we accept the temp ID in the path.

    # === Test 2: Save Document Record ===
    print("\n--- Test 2: save_document_record ---")
    test_generated_document_id = supabase_service.save_document_record(
        user_id=TEST_USER_UID,
        filename=TEST_PDF_FILENAME,
        storage_path=test_storage_path,  # Use the path returned by upload
        doc_type="pdf",
        metadata=test_metadata
    )
    assert test_generated_document_id is not None, "Test 2 FAILED: Saving document record returned None."
    assert isinstance(test_generated_document_id, uuid.UUID), "Test 2 FAILED: Returned document ID is not a UUID."
    print(f"Test 2 PASSED: Document record saved. Generated Document ID: {test_generated_document_id}")

    # === Test 3: Save Sections Batch ===
    print("\n--- Test 3: save_sections_batch ---")
    if test_generated_document_id:
        # Add the required document_id and user_id to the section data before saving
        for section in test_sections:
            section['document_id'] = test_generated_document_id
            section['user_id'] = TEST_USER_UID

        test_generated_section_ids = supabase_service.save_sections_batch(test_sections)
        assert test_generated_section_ids is not None, "Test 3 FAILED: Saving sections returned None."
        assert isinstance(test_generated_section_ids, list), "Test 3 FAILED: Saving sections did not return a list."
        assert len(test_generated_section_ids) == len(test_sections), \
            f"Test 3 FAILED: Expected {len(test_sections)} section IDs, got {len(test_generated_section_ids)}."
        assert all(isinstance(sid, uuid.UUID) for sid in test_generated_section_ids), \
            "Test 3 FAILED: Not all returned section IDs are UUIDs."
        print(f"Test 3 PASSED: Sections batch saved. Generated Section IDs: {test_generated_section_ids}")
    else:
        print("Skipping Test 3 because document ID was not generated in Test 2.")

    # === Test 4: Save Chunks Batch ===
    print("\n--- Test 4: save_chunks_batch ---")
    if test_generated_document_id and test_generated_section_ids and len(test_generated_section_ids) == len(test_sections):
        # Add the required document_id, user_id, and section_id to the chunk data
        # Assign section IDs based on the order they were created/saved
        test_chunks[0]['section_id'] = test_generated_section_ids[0]  # Chunk 0 belongs to Section 0
        test_chunks[1]['section_id'] = test_generated_section_ids[1]  # Chunk 1 belongs to Section 1
        for chunk in test_chunks:
            chunk['document_id'] = test_generated_document_id
            chunk['user_id'] = TEST_USER_UID

        save_chunks_success = supabase_service.save_chunks_batch(test_chunks)
        assert save_chunks_success, "Test 4 FAILED: Saving chunks returned False."
        print("Test 4 PASSED: Chunks batch save reported success.")
    else:
        print("Skipping Test 4 because document ID or section IDs were not generated successfully.")

    # === Test 5: Update Document Status ===
    print("\n--- Test 5: update_document_status ---")
    if test_generated_document_id:
        update_success = supabase_service.update_document_status(test_generated_document_id, "completed")
        assert update_success, "Test 5 FAILED: Updating document status returned False."
        print("Test 5 PASSED: Document status update reported success.")
        # You would typically verify the status change with a SELECT query in a real test
    else:
        print("Skipping Test 5 because document ID was not generated.")

    print("\n--- SupabaseService Method Tests Complete ---")

else:
    print("\nSkipping SupabaseService tests as the service could not be instantiated.")


--- Testing SupabaseService Methods ---

--- Test 1: upload_pdf_to_storage ---
Attempting to upload PDF to storage path: e222921f-cfdc-4a05-8cf2-aea13004bcf2/d631920f-676b-4080-b2c3-8783126bb4c7/test_invoice_for_service.pdf
Supabase storage upload response for e222921f-cfdc-4a05-8cf2-aea13004bcf2/d631920f-676b-4080-b2c3-8783126bb4c7/test_invoice_for_service.pdf
PDF successfully uploaded to: e222921f-cfdc-4a05-8cf2-aea13004bcf2/d631920f-676b-4080-b2c3-8783126bb4c7/test_invoice_for_service.pdf
Test 1 PASSED: PDF upload seems successful. Path: e222921f-cfdc-4a05-8cf2-aea13004bcf2/d631920f-676b-4080-b2c3-8783126bb4c7/test_invoice_for_service.pdf

--- Test 2: save_document_record ---
Saving document record for: test_invoice_for_service.pdf (User: e222921f-cfdc-4a05-8cf2-aea13004bcf2)
Document record saved successfully. Document ID: b96f7615-efdd-46c4-87cf-f28c1bd54c45
Test 2 PASSED: Document record saved. Generated Document ID: b96f7615-efdd-46c4-87cf-f28c1bd54c45

--- Test 3: save_section

In [6]:
# Cell 6: (Optional but Recommended) Cleanup - Delete test data from DB and Storage
#
# IMPORTANT: Use with caution! This will delete the data created by this test.
# Ensure the IDs used for deletion are correct.
#
# The document record and the uploaded file are cleaned up independently: the upload
# happens before the document insert, so a file can exist even when no document ID
# was ever generated (e.g. the test cell stopped on a failed assertion after Test 1).

print("\n--- Cleanup ---")
if supabase_service and (test_generated_document_id or test_storage_path):

    # 1. Delete Document Record (CASCADE should handle sections and chunks)
    if test_generated_document_id:
        print(f"Attempting to delete data associated with Document ID: {test_generated_document_id}")
        print("  Deleting document record (cascade should delete sections and chunks)...")
        try:
            delete_doc_response = (
                supabase_service.client.table('documents')
                .delete()
                .eq('id', str(test_generated_document_id))
                .eq('user_id', str(TEST_USER_UID))
                .execute()
            )
            # response.data usually contains the deleted rows.
            deleted_rows = delete_doc_response.data
            if deleted_rows:
                print(f"  Successfully deleted document record: {deleted_rows[0].get('id')}")
            elif deleted_rows is not None:
                # An empty (but present) result means the filter matched no row. An empty
                # list is falsy, so it has to be told apart from None explicitly.
                print(f"  Document record {test_generated_document_id} not found (already deleted?).")
            else:
                print(f"  Document deletion might have failed. Response: {delete_doc_response}")

        except Exception as e:
            # Best-effort cleanup: report and carry on to the storage step.
            print(f"  Error deleting document record: {e}")
    else:
        print("  Skipping document record deletion as no document ID was generated.")

    # 2. Delete Uploaded File from Storage
    if test_storage_path:
        print(f"  Deleting file from storage: {test_storage_path}...")
        try:
            # Delete exactly the path that was used for the upload (it embeds the temporary
            # document ID from the upload step, not the final document ID).
            delete_response = supabase_service.client.storage.from_(
                SupabaseService.STORAGE_BUCKET_NAME
            ).remove([test_storage_path])
            if delete_response:
                print("  Storage file deletion successful.")
            else:
                print("  Storage file deletion might have failed or file not found.")
        except Exception as e:
            print(f"  Error deleting storage file: {e}")
    else:
        print("  Skipping storage file deletion as path is unknown.")

else:
    print("Skipping cleanup as SupabaseService is not available or no test data was created.")

print("\n--- Cleanup Complete ---")


--- Cleanup ---
Attempting to delete data associated with Document ID: b96f7615-efdd-46c4-87cf-f28c1bd54c45
  Deleting document record (cascade should delete sections and chunks)...
  Successfully deleted document record: b96f7615-efdd-46c4-87cf-f28c1bd54c45
  Deleting file from storage: e222921f-cfdc-4a05-8cf2-aea13004bcf2/d631920f-676b-4080-b2c3-8783126bb4c7/test_invoice_for_service.pdf...
  Storage file deletion successful.

--- Cleanup Complete ---
