# End-to-End Ingestion Test

This notebook demonstrates parsing a Docling JSON file, transforming it into RAG primitives (`StructureNode`, `ContentAtom`), and inserting the result into a Postgres database with `pgvector`.

In [None]:
import os
import sys
import uuid

# Add root to path so we can import 'ingest'
sys.path.append(os.path.abspath(".."))

from ingest.pipeline import run_ingestion
from ingest.db import get_db_connection, ensure_schema

In [None]:
# 1. Setup Database Schema
# Ensure the DB is running (e.g., docker-compose up -d db)
#
# The connection is closed in a `finally` so that a failure inside
# `ensure_schema` does not leave an open connection behind in the kernel.
try:
    conn = get_db_connection()
    try:
        ensure_schema(conn)
        print("Schema ensured.")
    finally:
        # Runs on both success and failure of ensure_schema.
        conn.close()
except Exception as e:
    # Best-effort cell: report the problem instead of aborting the notebook.
    print(f"Could not connect to DB: {e}")
    print("Make sure Postgres is running.")

In [None]:
# 2. Run Ingestion for 'Green Line' (Real Data)
# `json_path` is kept as a notebook-level name because the next ingestion
# cell reuses it; `book_id_1` is a freshly generated random UUID.
json_path = "../data/toy_green_line_1_docling.json"
book_id_1 = uuid.uuid4()

# Guard first: report a missing input file instead of calling the pipeline.
if not os.path.exists(json_path):
    print(f"File not found: {json_path}")
else:
    run_ingestion(json_path, book_id=book_id_1, should_mock_embedding=True)

In [None]:
# 3. Run Ingestion for 'Mock Book' (Test Partitioning)
# We will reuse the same JSON but treat it as a different book ID to test partitioning logic.
# NOTE: this cell relies on `json_path` defined in the 'Green Line' ingestion cell.
book_id_2 = uuid.uuid4()

if os.path.exists(json_path):
    # Only announce the ingestion once we know the input file is present.
    print(f"Ingesting second book with ID: {book_id_2}")
    run_ingestion(json_path, book_id=book_id_2, should_mock_embedding=True)
else:
    # Mirror the first ingestion cell instead of silently doing nothing.
    print(f"File not found: {json_path}")

In [None]:
# 4. Verify Data in DB
# Prints the row counts of `structure_nodes` and `content_atoms`, then lists
# the tables whose name starts with `content_book_`.
# The connection is closed in a `finally` so a failing query cannot leak it.
try:
    conn = get_db_connection()
    try:
        cur = conn.cursor()

        # Check Structure Nodes
        cur.execute("SELECT count(*) FROM structure_nodes")
        nodes_count = cur.fetchone()[0]

        # Check Content Atoms
        cur.execute("SELECT count(*) FROM content_atoms")
        atoms_count = cur.fetchone()[0]

        print(f"Total Structure Nodes: {nodes_count}")
        print(f"Total Content Atoms: {atoms_count}")

        # Check Partitioning (should see tables like content_book_...)
        # `_` is a single-character wildcard in LIKE, so it is escaped here to
        # match the literal prefix `content_book_` only. An explicit ESCAPE
        # character avoids depending on backslash handling in string literals.
        cur.execute("""
            SELECT tablename FROM pg_tables
            WHERE tablename LIKE 'content!_book!_%' ESCAPE '!'
            ORDER BY tablename
        """)
        partitions = cur.fetchall()
        print("Partitions found:", [p[0] for p in partitions])
    finally:
        # Runs whether or not the queries above succeeded.
        conn.close()
except Exception as e:
    # Best-effort cell: include the exception type so terse errors stay readable.
    print(f"Verification failed: {type(e).__name__}: {e}")