In [1]:
# Cell 1: Setup - Add project root to sys.path and install necessary libraries
import sys
import os
import io
import uuid
from pathlib import Path # Use pathlib for easier path handling
from dotenv import load_dotenv

# Make the repository root importable. The root is taken to be the parent of
# the current working directory (i.e. the notebook runs one level below it).
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if sys.path.count(project_root) == 0:
    # Prepend so the entry is searched before anything already on sys.path.
    sys.path.insert(0, project_root)
print("Project root added to sys.path:", project_root)
print("Current working directory:", os.getcwd())

# Dependency installation is left disabled; uncomment the line(s) you need
# and re-run this cell if packages are missing.
# !pip install -r ../requirements.txt
# !pip install supabase python-dotenv pydantic google-genai openai pymupdf chonkie
# !pip install -e . # Install local src package

print("\n--- Setup Complete ---")

Project root added to sys.path: c:\Users\wbrya\OneDrive\Documents\GitHub\AI-CFO-FYP
Current working directory: c:\Users\wbrya\OneDrive\Documents\GitHub\AI-CFO-FYP\notebooks

--- Setup Complete ---


In [2]:
# Cell 2: Import necessary modules and the IngestionPipeline
from supabase import create_client, Client
from src.pipeline import IngestionPipeline # Import the main pipeline class

# Status banner: reaching this line means the imports above resolved.
# One call with a newline separator emits the same two messages as before.
print("Imported IngestionPipeline and Supabase client.", "\n--- Imports Complete ---", sep="\n")

Imported IngestionPipeline and Supabase client.

--- Imports Complete ---


In [3]:
# Cell 3: Load Environment Variables and Authenticate Test User for Supabase Client
#
# Outcome of this cell: `supabase_client_authenticated` is either a signed-in
# Supabase client whose user ID equals TEST_USER_UID, or None. Later cells
# truth-test that variable to decide whether to run.
load_dotenv() # Load variables from .env file

# Get Supabase credentials
supabase_url = os.environ.get("SUPABASE_URL")
supabase_key = os.environ.get("SUPABASE_ANON_KEY") # Use ANON key

# --- Test User Credentials (MUST match the TEST_USER_UID below) ---
test_email = os.environ.get("TEST_EMAIL")
test_password = os.environ.get("TEST_PASSWORD")
# --- Target User UID ---
TEST_USER_UID_STR = "e222921f-cfdc-4a05-8cf2-aea13004bcf2"
TEST_USER_UID = uuid.UUID(TEST_USER_UID_STR)

# Report only whether each setting is present, never its value, so the saved
# notebook output does not carry credentials.
print(f"SUPABASE_URL Loaded: {'Yes' if supabase_url else 'No'}")
print(f"SUPABASE_ANON_KEY Loaded: {'Yes' if supabase_key else 'No'}")
print(f"TEST_EMAIL Loaded: {'Yes' if test_email else 'No'}")
print(f"TEST_PASSWORD Loaded: {'Yes' if test_password else 'No'}")
print(f"Target TEST_USER_UID: {TEST_USER_UID}")

# Start from "not authenticated". Only the fully verified path below replaces
# this, so every failure path leaves it as None without repeating the reset.
supabase_client_authenticated: Client | None = None

# Basic validation
if not all([supabase_url, supabase_key, test_email, test_password]):
    print("\nError: Missing required Supabase credentials or test user info in .env file.")
else:
    try:
        # Initialize Supabase client
        print("\nInitializing Supabase client...")
        temp_client: Client | None = create_client(supabase_url, supabase_key)
        print("Supabase client initialized.")

        # --- Authenticate the client ---
        # Mask the address before logging it: cell output is stored with the
        # notebook, and the full e-mail is personal data.
        email_local, email_at, email_domain = test_email.partition("@")
        masked_email = f"{email_local[:1]}***{email_at}{email_domain}"
        print(f"\nAttempting to sign in user: {masked_email}...")
        response = temp_client.auth.sign_in_with_password(
            {"email": test_email, "password": test_password}
        )

        if response and response.session and response.user:
            print(f"Sign-in successful for user ID: {response.user.id}")
            # Compare as UUID objects rather than raw strings so letter case
            # or formatting differences cannot cause a false mismatch. A value
            # that is not a valid UUID raises ValueError, handled below.
            if uuid.UUID(str(response.user.id)) == TEST_USER_UID:
                print("Logged-in user ID matches target TEST_USER_UID.")
                # Store the authenticated client for the pipeline
                supabase_client_authenticated = temp_client
            else:
                print(f"CRITICAL ERROR: Logged-in user ID ({response.user.id}) does NOT match target ({TEST_USER_UID}).")
        else:
            print("Sign-in failed. Response:", response)

    except Exception as e:
        # Covers client construction, the sign-in call and UUID parsing; the
        # client variable keeps its None default in all of these cases.
        print(f"\nError initializing Supabase client or signing in: {e}")

print("\n--- Client Initialization and Authentication Complete ---")

SUPABASE_URL Loaded: Yes
SUPABASE_ANON_KEY Loaded: Yes
TEST_EMAIL Loaded: Yes
TEST_PASSWORD Loaded: Yes
Target TEST_USER_UID: e222921f-cfdc-4a05-8cf2-aea13004bcf2

Initializing Supabase client...
Supabase client initialized.

Attempting to sign in user: wbryanlai@gmail.com...
Sign-in successful for user ID: e222921f-cfdc-4a05-8cf2-aea13004bcf2
Logged-in user ID matches target TEST_USER_UID.

--- Client Initialization and Authentication Complete ---


In [4]:
# Cell 4: Define Test File Path and Run the Ingestion Pipeline

# --- Configuration ---
# Path to the PDF file you want to process, relative to the project root
# Make sure this file exists!
from src.services.SupabaseService import SupabaseService


PDF_RELATIVE_PATH = "data/source_pdfs/10k_tesla_3_pages.pdf"
# --- End Configuration ---

# Variable to store the result of the pipeline run
pipeline_run_result = None
final_document_id = None

# Proceed only if we have an authenticated Supabase client
if supabase_client_authenticated:
    print("\n--- Running the Full Ingestion Pipeline ---")

    # Construct the full path to the PDF file
    pdf_full_path = Path(project_root) / PDF_RELATIVE_PATH
    original_filename = pdf_full_path.name
    doc_type = pdf_full_path.suffix[1:].lower() # Get 'pdf' from '.pdf'

    if not pdf_full_path.is_file():
        print(f"Error: Test PDF file not found at {pdf_full_path}")
    else:
        print(f"Processing file: {original_filename} (Type: {doc_type})")
        print(f"Owned by User ID: {TEST_USER_UID}")

        try:
            # Instantiate the IngestionPipeline
            # We can pass the authenticated client to the SupabaseService instance it creates
            # (or modify pipeline __init__ to accept and pass clients)
            # For now, we rely on SupabaseService creating its own client,
            # which will pick up the auth context set by sign_in_with_password
            # on the shared supabase_py instance state (this is how supabase-py often works).
            # A more robust approach would be explicit client passing.
            print("Instantiating IngestionPipeline...")
            pipeline = IngestionPipeline(
                 # Explicitly pass the authenticated client to SupabaseService if preferred:
                 supabase_service=SupabaseService(supabase_client=supabase_client_authenticated)
                 # Otherwise, rely on the default __init__ behavior.
            )
            print("Pipeline instantiated.")

            # Open the PDF file as a buffer and run the pipeline
            with open(pdf_full_path, "rb") as f:
                pdf_buffer = io.BytesIO(f.read())

                pipeline_run_result = await pipeline.run(
                    pdf_file_buffer=pdf_buffer,
                    user_id=TEST_USER_UID,
                    original_filename=original_filename,
                    doc_type=doc_type
                )

            # Print the final result from the pipeline run
            print("\n--- Pipeline Run Result ---")
            print(f"Success: {pipeline_run_result.get('success')}")
            print(f"Message: {pipeline_run_result.get('message')}")
            final_document_id = pipeline_run_result.get('document_id')
            print(f"Document ID: {final_document_id}")
            if pipeline_run_result.get('success'):
                 print(f"Chunks Generated: {pipeline_run_result.get('chunk_count')}")


        except Exception as e:
            print(f"\nAn unexpected error occurred during pipeline execution: {e}")
            # Print traceback for debugging
            import traceback
            traceback.print_exc()

else:
    print("\nSkipping Pipeline execution because Supabase client authentication failed.")

print("\n--- Pipeline Execution Complete ---")


--- Running the Full Ingestion Pipeline ---
Processing file: 10k_tesla_3_pages.pdf (Type: pdf)
Owned by User ID: e222921f-cfdc-4a05-8cf2-aea13004bcf2
Instantiating IngestionPipeline...
SupabaseService initialized with provided client.
Initializing IngestionPipeline...
Initializing Gemini client with API key: AIz...yQ
Initialized OpenAI client with model: text-embedding-3-small
Initialized Sectioner.
Initializing ChunkingService with RecursiveChunker...


  from .autonotebook import tqdm as notebook_tqdm


RecursiveChunker initialized with chunk_size=2048, min_chars=24
Initialized EmbeddingService using model: text-embedding-3-small
IngestionPipeline initialized with all services.
Pipeline instantiated.

--- Starting Ingestion Pipeline for: 10k_tesla_3_pages.pdf (User: e222921f-cfdc-4a05-8cf2-aea13004bcf2) ---

Step 1: Parsing PDF to Markdown...
PDF has 3 pages
Rendering page 1/3
Rendering page 2/3
Rendering page 3/3
Starting page annotation with max 3 concurrent workers...
Page 1: Annotating (Attempt 1/6)
Page 2: Annotating (Attempt 1/6)
Page 3: Annotating (Attempt 1/6)
Page 2: Annotation successful.
Page 1: Annotation successful.
Page 3: Annotation successful.
Parsing successful (3 pages). Markdown length: 10035

Step 2: Extracting Document Metadata...
Sending text snippet to LLM for structured metadata extraction...
Structured metadata extraction attempted.
Metadata extraction successful.
  Extracted Type: Annual Report
  Extracted Company: Tesla, Inc.

Step 3: Uploading Original PDF 

In [6]:
# Cell 5: (Optional but Recommended) Verification in Supabase
#
# Queries the documents and chunks tables through the authenticated client and
# checks: the document row exists with status 'completed', the stored chunk
# count equals the count the pipeline reported, and the first chunk carries a
# non-null embedding that parses to a list. Failures are printed, not raised.

import json # <-- Import json for parsing embedding string

# Verify the results directly in your Supabase Dashboard:
# 1. Go to Storage -> financial-pdfs bucket. Check if a folder matching TEST_USER_UID exists
#    and contains a subfolder matching the final_document_id (if generated) with the PDF inside.
# 2. Go to Table Editor -> documents table. Filter by the final_document_id. Check if the
#    record exists, belongs to the correct user_id, and has the expected status ('completed')
#    and metadata.
# 3. Go to Table Editor -> sections table. Filter by the final_document_id. Check if the
#    expected number of sections were created and linked correctly.
# 4. Go to Table Editor -> chunks table. Filter by the final_document_id. Check if the
#    expected number of chunks were created, linked correctly, and have an 'embedding' column
#    that is not null (verifying embeddings were saved).

# You can also add code here to query Supabase using the client to verify programmatically.

if supabase_client_authenticated and final_document_id:
    print("\n--- Programmatic Verification (Basic) ---")
    try:
        # Check document record
        doc_response = supabase_client_authenticated.table("documents")\
            .select("id, status, filename, doc_specific_type")\
            .eq("id", str(final_document_id))\
            .eq("user_id", str(TEST_USER_UID))\
            .maybe_single()\
            .execute()

        # NOTE(review): maybe_single().execute() appears to return None (not
        # a response with empty data) when no row matches in some supabase-py
        # versions - confirm against the installed version. Guarding here
        # keeps the "record not found" branch reachable either way.
        doc_record = doc_response.data if doc_response is not None else None

        if doc_record:
            print(f"Document Record Verification:")
            print(f"  Found Document: {doc_record.get('id')}")
            print(f"  Status: {doc_record.get('status')}")
            print(f"  Filename: {doc_record.get('filename')}")
            print(f"  Type: {doc_record.get('doc_specific_type')}")
            # Explicit raise instead of `assert`: assert statements are
            # removed when Python runs with -O, which would skip the check.
            if doc_record.get('status') != 'completed':
                raise AssertionError(
                    f"Document status is not 'completed'! Found: '{doc_record.get('status')}'"
                )
        else:
            print(f"Verification FAILED: Document record not found for ID: {final_document_id}")
            raise AssertionError(
                f"Document record verification failed: Record not found for ID {final_document_id}"
            )


        # Check chunk count
        # Note: RLS applies, so this counts only chunks for the authenticated user AND this document
        chunk_count_response = supabase_client_authenticated.table("chunks")\
            .select("id", count="exact")\
            .eq("document_id", str(final_document_id))\
            .execute()

        db_chunk_count = chunk_count_response.count
        print(f"\nChunk Verification:")
        print(f"  Count of chunks found in DB for this document: {db_chunk_count}")
        # Compare with the count returned by the pipeline if available.
        # pipeline_run_result can be None if the pipeline cell was not (re)run.
        pipeline_chunk_count = (
            pipeline_run_result.get('chunk_count') if pipeline_run_result else None
        )
        if pipeline_chunk_count is not None:
            if db_chunk_count != pipeline_chunk_count:
                raise AssertionError(
                    f"Chunk count mismatch! Pipeline reported {pipeline_chunk_count}, DB has {db_chunk_count}."
                )
            print(f"  Matches chunk count reported by pipeline ({pipeline_chunk_count}).")
        else:
            print("  Pipeline result did not report chunk count for comparison.")


        # Check first chunk embedding (ensure it's not null and is a list after potential parsing)
        first_chunk_response = supabase_client_authenticated.table("chunks")\
            .select("id, embedding")\
            .eq("document_id", str(final_document_id))\
            .order("chunk_index")\
            .limit(1)\
            .maybe_single()\
            .execute()

        # Same None guard as for the document lookup above.
        first_chunk = first_chunk_response.data if first_chunk_response is not None else None

        if first_chunk:
            print("\nFirst Chunk Embedding Verification:")
            print(f"  First Chunk ID: {first_chunk.get('id')}")
            embedding_value_raw = first_chunk.get('embedding') # Get the raw value

            if embedding_value_raw is None:
                raise AssertionError("Embedding value is NULL!")

            parsed_embedding_list = None
            if isinstance(embedding_value_raw, str):
                # The API returned the vector as text of the form
                # '[num, num, ...]'; json.loads can read that format.
                try:
                    parsed_embedding_list = json.loads(embedding_value_raw)
                    print(f"  Raw embedding was string, parsed to list.")
                except json.JSONDecodeError:
                    print(f"  Error: Could not parse embedding string: {embedding_value_raw[:100]}...")
                    raise AssertionError("Failed to parse embedding string from database.")
            elif isinstance(embedding_value_raw, list):
                # If it's already a list (less likely but possible with future library updates)
                parsed_embedding_list = embedding_value_raw
                print(f"  Raw embedding was already a list.")
            else:
                raise AssertionError(
                    f"Embedding value has unexpected type: {type(embedding_value_raw)}"
                )

            # json.loads may yield a non-list (e.g. a number) for odd input.
            if not isinstance(parsed_embedding_list, list):
                raise AssertionError("Parsed embedding value is not a list!")

            print(f"  Embedding exists and is a list (length: {len(parsed_embedding_list)}).")
            # Optional: Further check content if needed
            # assert all(isinstance(x, (int, float)) for x in parsed_embedding_list), \
            #    "Parsed embedding list contains non-numeric values"

        elif db_chunk_count: # Only fail if chunks existed but query failed
            # Truthiness (rather than `> 0`) also tolerates a None count.
            print("  Verification FAILED: Could not retrieve first chunk to verify embedding.")
            raise AssertionError("Could not retrieve first chunk.")
        else: # No chunks existed in the first place
            print("  No chunks found to verify embedding (expected, as chunk count is 0).")


        print("\n--- Programmatic Verification Complete ---")

    except AssertionError as e:
        # Verification failures are reported but deliberately not re-raised,
        # so the cleanup cell can still be run afterwards.
        print(f"\nVERIFICATION FAILED: Assertion Error - {e}")
    except Exception as e:
        print(f"\nAn error occurred during programmatic verification: {e}")
        # Print traceback for debugging other errors
        import traceback
        traceback.print_exc()


else:
    print("\nSkipping programmatic verification as client/document ID is not available.")


--- Programmatic Verification (Basic) ---
Document Record Verification:
  Found Document: a5960ced-b9f0-434b-bfe9-bdf097e0e0b0
  Status: completed
  Filename: 10k_tesla_3_pages.pdf
  Type: Annual Report

Chunk Verification:
  Count of chunks found in DB for this document: 9
  Matches chunk count reported by pipeline (9).

First Chunk Embedding Verification:
  First Chunk ID: f2460100-f894-431d-af2e-fee32f998eb5
  Raw embedding was string, parsed to list.
  Embedding exists and is a list (length: 1536).

--- Programmatic Verification Complete ---


In [7]:
# Cell 6: (Optional but Recommended) Cleanup - Delete test data created by this run
#
# Order matters: the storage path is read from the document row first, the
# stored file is removed next, and the document row is deleted last (once the
# row is gone the path can no longer be looked up).

# Import SupabaseService if not already imported in this scope (needed for BUCKET_NAME)
# If Cell 4 imported it, this is technically redundant but safe
from src.services.SupabaseService import SupabaseService

print("\n--- Cleanup ---")
# Use the final_document_id obtained from the pipeline run result for cleanup
if supabase_client_authenticated and final_document_id:
    print(f"Attempting cleanup for Document ID: {final_document_id}")

    # Instantiate SupabaseService again, passing the authenticated client for cleanup actions
    # Although we use the client directly below, instantiating the service is fine too
    cleanup_service = SupabaseService(supabase_client=supabase_client_authenticated)

    # --- Step 1: Retrieve the Storage Path BEFORE deleting the document ---
    retrieved_storage_path: str | None = None
    print("  Retrieving storage path from document record...")
    try:
        doc_path_response = cleanup_service.client.table('documents')\
            .select("storage_path")\
            .eq('id', str(final_document_id))\
            .eq('user_id', str(TEST_USER_UID))\
            .maybe_single()\
            .execute()

        # NOTE(review): maybe_single().execute() appears to return None when
        # no row matches in some supabase-py versions - confirm. Guarding
        # keeps the "could not retrieve" message reachable instead of
        # surfacing an AttributeError.
        doc_path_record = doc_path_response.data if doc_path_response is not None else None

        if doc_path_record and doc_path_record.get('storage_path'):
            retrieved_storage_path = doc_path_record['storage_path']
            print(f"  Found storage path: {retrieved_storage_path}")
        else:
            print("  Could not retrieve storage path from document record (might be already deleted or missing).")

    except Exception as e:
        print(f"  Error retrieving storage path: {e}")


    # --- Step 2: Delete Uploaded File from Storage using the RETRIEVED path ---
    if retrieved_storage_path:
        print(f"  Deleting file from storage using retrieved path: {retrieved_storage_path}...")
        try:
            # Use the path retrieved from the database record
            delete_response = cleanup_service.client.storage.from_(SupabaseService.STORAGE_BUCKET_NAME).remove([retrieved_storage_path])
            if delete_response: # Non-empty response: at least one item was reported
                print("  Storage file deletion attempt processed.")
                # The response is iterated as a collection of deleted items;
                # each item's 'name' is echoed for a visual check.
                for item in delete_response:
                    print(f"    Deleted item name from response: {item.get('name')}")
                # A non-empty response is taken as success; the item names are
                # not compared against the requested path.
                print("  Storage file likely deleted successfully.")
            else:
                print("  Storage file deletion response was empty (might mean file not found or already deleted).")
        except Exception as e:
            print(f"  Error deleting storage file: {e}")
            if "Object not found" in str(e):
                print("  Hint: File was likely already deleted.")
    else:
        print("  Skipping storage file deletion as path could not be retrieved.")


    # --- Step 3: Delete Document Record (CASCADE should handle sections and chunks) ---
    print("  Deleting document record from database...")
    try:
        # Use the client directly for simplicity in cleanup verification
        response = cleanup_service.client.table('documents')\
            .delete()\
            .eq('id', str(final_document_id))\
            .eq('user_id', str(TEST_USER_UID)) \
            .execute()
        # response.data is expected to hold the deleted rows.
        deleted_rows = response.data
        if deleted_rows:
            print(f"  Successfully deleted document record: {deleted_rows[0].get('id')}")
        elif deleted_rows is not None:
            # Empty (but present) result: nothing matched the filters. This
            # must be tested with `is not None`, because an empty list is
            # falsy and would otherwise fall through to the failure message.
            print(f"  Document record {final_document_id} not found (already deleted?).")
        else:
            print(f"  Document deletion might have failed. Response: {response}") # Print full response for debug

    except Exception as e:
        print(f"  Error deleting document record: {e}")


else:
    print("Skipping cleanup as Supabase client or final document ID is not available.")

print("\n--- Cleanup Complete ---")

# Optional: Sign out the user
# if supabase_client_authenticated:
#    print("\nSigning out test user...")
#    supabase_client_authenticated.auth.sign_out()
#    print("User signed out.")


--- Cleanup ---
Attempting cleanup for Document ID: a5960ced-b9f0-434b-bfe9-bdf097e0e0b0
SupabaseService initialized with provided client.
  Retrieving storage path from document record...
  Found storage path: e222921f-cfdc-4a05-8cf2-aea13004bcf2/ed6ee9c9-f40b-4090-a98c-906ecea74d0c/10k_tesla_3_pages.pdf
  Deleting file from storage using retrieved path: e222921f-cfdc-4a05-8cf2-aea13004bcf2/ed6ee9c9-f40b-4090-a98c-906ecea74d0c/10k_tesla_3_pages.pdf...
  Storage file deletion attempt processed.
    Deleted item name from response: e222921f-cfdc-4a05-8cf2-aea13004bcf2/ed6ee9c9-f40b-4090-a98c-906ecea74d0c/10k_tesla_3_pages.pdf
  Storage file likely deleted successfully.
  Deleting document record from database...
  Successfully deleted document record: a5960ced-b9f0-434b-bfe9-bdf097e0e0b0

--- Cleanup Complete ---
