In [1]:
# Cell 1: Setup - Add project root to sys.path and install necessary libraries
# This ensures we can import from the 'src' directory
import sys
import os

# Resolve the project root as the parent of the current working directory.
# This relies on the notebook being launched from AI-CFO-FYP/notebooks/.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Put the project root at the front of the import search path (once only)
# so that the 'src' package can be imported by later cells.
if sys.path.count(project_root) == 0:
    sys.path.insert(0, project_root)

print("Project root added to sys.path:", project_root)
print("Current working directory:", os.getcwd())

# Dependencies: OpenAIClient needs 'openai' and 'python-dotenv'.
# !pip install openai python-dotenv
# If importing from the local src directory requires it, install the project in editable mode:
# !pip install -e .

print("\n--- Setup Complete ---")

Project root added to sys.path: c:\Users\wbrya\OneDrive\Documents\GitHub\AI-CFO-FYP
Current working directory: c:\Users\wbrya\OneDrive\Documents\GitHub\AI-CFO-FYP\notebooks

--- Setup Complete ---


In [2]:
# Cell 2: Import necessary modules and classes
import uuid # Standard library for UUIDs
from typing import List, Dict, Any # Standard library for type hinting
# Import the ChunkData type (it's just a Dict[str, Any] for now, but good practice)
from src.services.ChunkingService import ChunkData
# Import the EmbeddingService
from src.services.EmbeddingService import EmbeddingService
# Import the OpenAIClient (EmbeddingService depends on it)
from src.llm.OpenAIClient import OpenAIClient

# Confirm that the imports above resolved, then mark the end of the cell.
# A single call with a newline separator emits the same two lines (and blank line) as before.
print(
    "Imported ChunkData, EmbeddingService, OpenAIClient.",
    "\n--- Imports Complete ---",
    sep="\n",
)

Imported ChunkData, EmbeddingService, OpenAIClient.

--- Imports Complete ---


In [3]:
# Cell 3: Load Environment Variables and Initialize OpenAI Client
#
# Loads variables from a .env file, checks that OPENAI_API_KEY is present, and
# tries to build the OpenAIClient used by later cells. On failure `openai_client`
# is set to None so that downstream cells can skip their work instead of crashing.
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Verify the OpenAI key is loaded.
# Only its presence is reported: notebook outputs are saved and shared with the
# file, so echoing any fragment of the key (prefix/suffix slices included) leaks
# secret material, and for a short value overlapping slices would print all of it.
openai_key = os.environ.get("OPENAI_API_KEY")
if openai_key:
    print("OpenAI API Key loaded (value hidden).")
else:
    print("OpenAI API Key not loaded. Please check your .env file.")
    # Per the original note, OpenAIClient() is expected to raise when the key is
    # missing; that case is handled by the except branches below.

try:
    # Instantiate the OpenAIClient
    openai_client = OpenAIClient()
    print("\nOpenAIClient initialized successfully.")

except ValueError as e:
    # Presumably raised by OpenAIClient for a missing/invalid configuration - TODO confirm.
    print(f"\nError initializing OpenAIClient: {e}")
    openai_client = None  # Set client to None if initialization failed
except Exception as e:
    # Any other failure is reported rather than propagated, so the notebook keeps running.
    print(f"\nAn unexpected error occurred during OpenAIClient initialization: {e}")
    openai_client = None

OpenAI API Key loaded: sk-...z-8A
Initialized OpenAI client with model: text-embedding-3-small

OpenAIClient initialized successfully.


In [4]:
# Cell 4: Prepare Sample Chunk Data
# Hand-built ChunkData dictionaries standing in for ChunkingService output.
# Each chunk carries the metadata fields EmbeddingService is expected to look up.
# NOTE(review): start/end character offsets are illustrative values; they are not
# derived from len(chunk_text).

# Explicit IDs shared by the first three chunks
TEST_DOCUMENT_ID = uuid.uuid4()
TEST_USER_ID = uuid.uuid4()
# Section IDs simulating the output of the previous pipeline step
TEST_SECTION_ID_1 = uuid.uuid4()
TEST_SECTION_ID_2 = uuid.uuid4()


def _make_chunk(*, section_id, document_id, user_id, chunk_text, chunk_index,
                start_char_index, end_char_index, doc_specific_type, doc_year,
                doc_quarter, company_name, report_date, section_heading):
    """Assemble one sample chunk dict.

    Keys are emitted in a fixed order and every call receives its own empty
    ``metadata`` dict, so chunks never share a mutable object.
    """
    return {
        "section_id": section_id,
        "document_id": document_id,
        "user_id": user_id,
        "chunk_text": chunk_text,
        "chunk_index": chunk_index,
        "start_char_index": start_char_index,
        "end_char_index": end_char_index,
        "doc_specific_type": doc_specific_type,
        "doc_year": doc_year,
        "doc_quarter": doc_quarter,
        "company_name": company_name,
        "report_date": report_date,
        "section_heading": section_heading,
        "metadata": {},  # Empty metadata for simplicity
    }


# Fields common to the three chunks that belong to the same document and user
_acme_2023 = {
    "document_id": TEST_DOCUMENT_ID,
    "user_id": TEST_USER_ID,
    "doc_year": 2023,
    "company_name": "Acme Corp",
    "report_date": "2023-12-31",
}

print("Preparing sample chunk data...")

sample_chunks_data: List[ChunkData] = [
    # Two chunks from the same section; the second uses a different section heading
    _make_chunk(
        **_acme_2023,
        section_id=TEST_SECTION_ID_1,
        chunk_text="Revenue increased by 15% in Q4 2023.",
        chunk_index=0,
        start_char_index=0,
        end_char_index=35,
        doc_specific_type="Income Statement",
        doc_quarter=4,
        section_heading="Revenue Analysis",
    ),
    _make_chunk(
        **_acme_2023,
        section_id=TEST_SECTION_ID_1,
        chunk_text="Operating expenses were $150 Million.",
        chunk_index=1,
        start_char_index=36,
        end_char_index=70,
        doc_specific_type="Income Statement",
        doc_quarter=4,
        section_heading="Operating Expenses",
    ),
    # Same document/user, different section and doc type, quarter deliberately None
    _make_chunk(
        **_acme_2023,
        section_id=TEST_SECTION_ID_2,
        chunk_text="Cash and equivalents totaled $50M as of Dec 31, 2023.",
        chunk_index=0,
        start_char_index=0,
        end_char_index=50,
        doc_specific_type="Balance Sheet",
        doc_quarter=None,
        section_heading="Current Assets",
    ),
    # Minimal metadata (None values, empty company name, placeholder date) to test robustness
    _make_chunk(
        section_id=uuid.uuid4(),
        document_id=uuid.uuid4(),
        user_id=uuid.uuid4(),
        chunk_text="Additional notes on liabilities.",
        chunk_index=0,
        start_char_index=0,
        end_char_index=30,
        doc_specific_type=None,
        doc_year=None,
        doc_quarter=None,
        company_name="",
        report_date="1900-01-01",
        section_heading=None,
    ),
    # A chunk simulating a different user/document
    _make_chunk(
        section_id=uuid.uuid4(),
        document_id=uuid.uuid4(),
        user_id=uuid.uuid4(),
        chunk_text="This chunk belongs to a different user.",
        chunk_index=0,
        start_char_index=0,
        end_char_index=35,
        doc_specific_type="Invoice",
        doc_year=2024,
        doc_quarter=1,
        company_name="Beta Inc",
        report_date="2024-03-31",
        section_heading="Summary",
    ),
]

print(f"Created {len(sample_chunks_data)} sample chunks.")

print("\n--- Sample Data Prepared ---")

Preparing sample chunk data...
Created 5 sample chunks.

--- Sample Data Prepared ---


In [5]:
# Cell 5: Test EmbeddingService
#
# Runs the sample chunks through EmbeddingService.generate_embeddings and, for up
# to the first five returned chunks, asserts that each has a numeric 'embedding'
# list of the expected length and an 'embedding_model' matching the client's model.
# Failures are reported via print (not re-raised) so the notebook keeps running.

print("\n--- Testing EmbeddingService ---")


def _preview_field(chunk, key, fallback):
    """Return chunk[key] for display, or `fallback` when it is missing, None or "".

    A plain ``chunk.get(key, fallback)`` only falls back when the key is absent;
    the sample chunks always contain the key (often with a None value), so the
    fallback has to be applied to the value itself.
    """
    value = chunk.get(key)
    if value is None or value == "":
        return fallback
    return value


# Proceed only if OpenAI client was initialized successfully
if openai_client:
    try:
        # Instantiate EmbeddingService
        embedding_service = EmbeddingService(openai_client=openai_client)
        print("EmbeddingService instantiated.")

        # Record the input size BEFORE the call: per the note below the service
        # may hand back the very same list object, in which case comparing the two
        # lengths afterwards could never fail.
        expected_chunk_count = len(sample_chunks_data)

        # Generate embeddings
        # The method modifies the list IN PLACE, but also returns it
        chunks_with_embeddings: List[ChunkData] = embedding_service.generate_embeddings(sample_chunks_data)

        print(f"\nEmbeddingService processed {len(chunks_with_embeddings)} chunks.")

        # --- Verification and Print Sample ---
        print("\n--- Verifying and Showing Sample of Chunks with Embeddings ---")

        assert len(chunks_with_embeddings) == expected_chunk_count, \
            f"Expected {expected_chunk_count} chunks back, but got {len(chunks_with_embeddings)}"

        num_chunks_to_show = min(len(chunks_with_embeddings), 5)
        print(f"Showing details for the first {num_chunks_to_show} chunks:")

        # Expected embedding dimension (1536 for text-embedding-3-small); loop-invariant.
        expected_dimension = 1536

        for i, chunk in enumerate(chunks_with_embeddings[:num_chunks_to_show]):
            print(f"\n--- Chunk {i+1} ---")

            # 1. Check for embedding key and type
            assert 'embedding' in chunk, f"Chunk {i+1} missing 'embedding' key"
            assert isinstance(chunk['embedding'], list), f"Chunk {i+1} 'embedding' is not a list"
            assert all(isinstance(x, (int, float)) for x in chunk['embedding']), \
                f"Chunk {i+1} 'embedding' list contains non-numeric values"

            # 2. Check embedding dimension
            assert len(chunk['embedding']) == expected_dimension, \
                f"Chunk {i+1} embedding dimension mismatch: Expected {expected_dimension}, Got {len(chunk['embedding'])}"
            print(f"  Embedding found with correct dimension ({expected_dimension}).")

            # 3. Check for embedding_model key and value
            assert 'embedding_model' in chunk, f"Chunk {i+1} missing 'embedding_model' key"
            assert chunk['embedding_model'] == openai_client.embedding_model, \
                f"Chunk {i+1} embedding_model mismatch: Expected '{openai_client.embedding_model}', Got '{chunk['embedding_model']}'"
            print(f"  Embedding model noted: '{chunk['embedding_model']}'.")

            # Print some preview details
            print(f"  Chunk Text Preview: {chunk.get('chunk_text', '')[:100]}...")
            print("  Augmented Text Snippet (used for embedding):")
            # Rebuild a preview of the augmented text for eyeballing.
            # NOTE(review): this mirrors the format assumed for EmbeddingService's
            # augmented text, including how None/"" fields are rendered - confirm
            # against the service implementation.
            augmented_text_preview = (
                f"Document Type: {_preview_field(chunk, 'doc_specific_type', 'Unknown')}. "
                f"Year: {_preview_field(chunk, 'doc_year', 'Unknown')}. "
                f"Quarter: {_preview_field(chunk, 'doc_quarter', 'Unknown')}. "
                f"Company: {_preview_field(chunk, 'company_name', 'Unknown')}. "
                f"Section: {_preview_field(chunk, 'section_heading', 'Unknown Section')}. "
                f"Content: {chunk.get('chunk_text', '')[:100]}..."  # Truncate for display
            )
            print(f"  '{augmented_text_preview}'")

        print(f"\n--- Verification of first {num_chunks_to_show} chunks complete. ---")
        print("\nAll assertions in Cell 5 passed.")

    except AssertionError as e:
        print(f"\nTEST FAILED: Assertion Error - {e}")
    except Exception as e:
        print(f"\nTEST FAILED: An unexpected error occurred during EmbeddingService test - {e}")

else:
    print("\nSkipping EmbeddingService test because OpenAIClient failed to initialize.")


--- Testing EmbeddingService ---
Initialized EmbeddingService using model: text-embedding-3-small
EmbeddingService instantiated.
Generating embeddings for 5 chunks using model: text-embedding-3-small...
Prepared 5 augmented texts for embedding.
Successfully added embeddings to 5 chunks.

EmbeddingService processed 5 chunks.

--- Verifying and Showing Sample of Chunks with Embeddings ---
Showing details for the first 5 chunks:

--- Chunk 1 ---
  Embedding found with correct dimension (1536).
  Embedding model noted: 'text-embedding-3-small'.
  Chunk Text Preview: Revenue increased by 15% in Q4 2023....
  Augmented Text Snippet (used for embedding):
  'Document Type: Income Statement. Year: 2023. Quarter: 4. Company: Acme Corp. Section: Revenue Analysis. Content: Revenue increased by 15% in Q4 2023....'

--- Chunk 2 ---
  Embedding found with correct dimension (1536).
  Embedding model noted: 'text-embedding-3-small'.
  Chunk Text Preview: Operating expenses were $150 Million....
  Augm