# Postgres Query Playground

Use this notebook to explore the Haven database. It includes helper functions for common queries and makes it easy to run custom SQL.

> **Note**: Ensure your database is running and accessible. Default connection: `postgresql://postgres:postgres@localhost:5432/haven`
> Override with `DATABASE_URL` environment variable if needed.


In [11]:
from __future__ import annotations

import json
import os
import sys
from datetime import date, datetime, time
from pathlib import Path
from typing import Any, Dict, List, Optional
from uuid import UUID

import pandas as pd
from IPython.display import display, HTML

# ---------------------------------------------------------------------------
# Ensure the Haven project root (and src/) are importable
# ---------------------------------------------------------------------------
def resolve_project_root() -> Path:
    """Locate the Haven repository root (the directory that contains ``src/haven``).

    Candidates are tried in this order:

    1. ``HAVEN_PROJECT_ROOT`` (with ``~`` expanded), when it is set to a non-empty value.
    2. The current working directory.
    3. The parent of the current working directory.

    Returns:
        The absolute, resolved path of the first candidate containing ``src/haven``.

    Raises:
        RuntimeError: If none of the candidates contains ``src/haven``.
    """
    candidates = []

    # Test the raw string: Path("") collapses to Path("."), and Path objects are
    # always truthy, so an unset variable must be filtered out before building a Path.
    env_value = os.getenv("HAVEN_PROJECT_ROOT")
    if env_value:
        candidates.append(Path(env_value).expanduser().resolve())

    cwd = Path.cwd().resolve()
    candidates.extend((cwd, cwd.parent))

    for root in candidates:
        if (root / "src" / "haven").exists():
            return root

    raise RuntimeError(
        "Unable to locate Haven project root. Set HAVEN_PROJECT_ROOT or launch the notebook from the repo root."
    )


PROJECT_ROOT = resolve_project_root()
SRC_PATH = PROJECT_ROOT / "src"
# Prepend whichever of the two directories is not yet importable. Each insert goes
# to position 0, so when both are added SRC_PATH ends up ahead of PROJECT_ROOT.
for import_dir in map(str, (PROJECT_ROOT, SRC_PATH)):
    if import_dir in sys.path:
        continue
    sys.path.insert(0, import_dir)

# ---------------------------------------------------------------------------
# Database connection setup
# ---------------------------------------------------------------------------
# Override DATABASE_URL to use localhost when running locally (outside Docker)
# The default connection string uses 'postgres' hostname which only works inside Docker
if "DATABASE_URL" not in os.environ:
    # Default to localhost for local development
    os.environ["DATABASE_URL"] = "postgresql://postgres:postgres@localhost:5432/haven"
elif "@postgres:" in os.environ.get("DATABASE_URL", ""):
    # If DATABASE_URL uses 'postgres' hostname, replace with 'localhost' for local notebook usage
    os.environ["DATABASE_URL"] = os.environ["DATABASE_URL"].replace("@postgres:", "@localhost:")

from shared.db import get_connection, get_cursor, get_conn_str
from shared.logging import setup_logging
from psycopg.rows import dict_row

# Presumably configures logging through the project's shared helper (defined in shared.logging).
setup_logging()

# Echo the resolved settings so the reader can confirm which checkout and database are in use.
# NOTE(review): the value of get_conn_str() is printed verbatim, and the saved output shows it
# includes the password; clear this cell's output before sharing if non-default credentials are used.
print(f"Project root: {PROJECT_ROOT}")
print(f"Database: {get_conn_str()}")


Project root: /Users/chrispatten/workspace/haven
Database: postgresql://postgres:postgres@localhost:5432/haven


In [12]:
# ---------------------------------------------------------------------------
# Helper Functions
# ---------------------------------------------------------------------------

def query(sql: str, params: tuple = ()) -> List[Dict[str, Any]]:
    """Run ``sql`` with ``params`` and return all rows, each as a dictionary.

    Args:
        sql: SQL text, using ``%s`` placeholders for bound values.
        params: Values bound to the placeholders, in order.

    Returns:
        Every row of the result set, in the order the cursor yields them.
    """
    with get_connection() as connection, connection.cursor(row_factory=dict_row) as cursor:
        cursor.execute(sql, params)
        rows = cursor.fetchall()
    return rows


def query_df(sql: str, params: tuple = ()) -> pd.DataFrame:
    """Run ``sql`` through :func:`query` and wrap the rows in a pandas DataFrame.

    Args:
        sql: SQL text, using ``%s`` placeholders for bound values.
        params: Values bound to the placeholders, in order.

    Returns:
        A DataFrame built from the list of row dictionaries.
    """
    return pd.DataFrame(query(sql, params))


def query_one(sql: str, params: tuple = ()) -> Optional[Dict[str, Any]]:
    """Execute a SQL query and return the first row as a dictionary, or None.

    Only the first row is converted to a dictionary; the remaining rows of the
    result set are never materialised as Python objects.

    Args:
        sql: SQL text, using ``%s`` placeholders for bound values.
        params: Values bound to the placeholders, in order.

    Returns:
        The first row as a ``dict``, or ``None`` when the query returns no rows.
    """
    with get_connection() as conn:
        with conn.cursor(row_factory=dict_row) as cur:
            cur.execute(sql, params)
            # fetchone() returns None when the result set is empty
            return cur.fetchone()


def execute(sql: str, params: tuple = ()) -> int:
    """Run a data-modifying statement (INSERT/UPDATE/DELETE) and report its rowcount.

    Args:
        sql: SQL text, using ``%s`` placeholders for bound values.
        params: Values bound to the placeholders, in order.

    Returns:
        The cursor's ``rowcount`` after the statement has executed.

    NOTE(review): whether the change is committed depends on what the
    get_connection() context manager does on exit -- confirm in shared.db.
    """
    with get_connection() as connection, connection.cursor() as cursor:
        cursor.execute(sql, params)
        affected = cursor.rowcount
    return affected


def json_dumps_record(record: Any, indent: Optional[int] = None) -> str:
    """
    Render a database record as JSON text, converting values ``json`` cannot encode.

    Conversion rules, applied in this order to each value ``json`` rejects:

    1. ``UUID``                         -> ``str(value)``
    2. ``datetime`` / ``date`` / ``time`` -> ``value.isoformat()``
    3. objects carrying a ``__dict__``  -> that ``__dict__`` (encoded recursively)
    4. objects defining ``__float__`` (e.g. ``Decimal``) -> ``float(value)``,
       or ``str(value)`` if the conversion raises ValueError/TypeError
    5. anything else                    -> ``str(value)``

    Args:
        record: The value to serialize (dict, list, or any scalar).
        indent: Indentation width for pretty printing; ``None`` gives compact output.

    Returns:
        The JSON text. Non-ASCII characters are emitted as-is, not escaped.

    Example:
        >>> doc = query_one("SELECT * FROM documents WHERE doc_id = %s", (doc_id,))
        >>> print(json_dumps_record(doc, indent=2))
    """
    def _to_jsonable(value: Any) -> Any:
        """Fallback hook handed to ``json.dumps`` for otherwise unsupported values."""
        if isinstance(value, UUID):
            return str(value)
        if isinstance(value, (datetime, date, time)):
            return value.isoformat()
        if hasattr(value, '__dict__'):
            # Arbitrary objects are exposed through their attribute dictionary
            return value.__dict__
        if not hasattr(value, '__float__'):
            # Nothing numeric about it: use the plain string form
            return str(value)
        try:
            return float(value)
        except (ValueError, TypeError):
            return str(value)

    return json.dumps(record, indent=indent, default=_to_jsonable, ensure_ascii=False)


# Confirmation banner: makes it visible in the cell output that the helper definitions above were run.
print("Helper functions loaded: query(), query_df(), query_one(), execute(), json_dumps_record()")


Helper functions loaded: query(), query_df(), query_one(), execute(), json_dumps_record()


## Database Overview


In [13]:
# Get table counts: one COUNT(*) per table, each issued through query_one().
tables = [
    "documents",
    "threads",
    "chunks",
    "files",
    "intent_signals",
    "people",
    "crm_relationships",
]

counts = {}
for table in tables:
    try:
        # Table names cannot be sent as bound parameters, so the name is interpolated into
        # the SQL text. That is only safe because `tables` is the hardcoded list above.
        result = query_one(f"SELECT COUNT(*) as count FROM {table}")
        counts[table] = result["count"] if result else 0
    except Exception as e:
        # Best effort: store the error text in place of the count and move on to the next table
        counts[table] = f"Error: {e}"

print("Table counts:")
for table, count in counts.items():
    # Table name is padded to a 25-character column so the counts line up
    print(f"  {table:25} {count}")


Table counts:
  documents                 51427
  threads                   324
  chunks                    51451
  files                     0
  intent_signals            0
  people                    472
  crm_relationships         0


## Documents


In [14]:
# Recent documents: the 20 most recently created rows (created_at DESC) among active
# versions, with the text column cut to a 100-character preview by LEFT().
recent_docs = query_df("""
    SELECT 
        doc_id,
        external_id,
        source_type,
        title,
        LEFT(text, 100) as text_preview,
        content_timestamp,
        thread_id,
        status,
        intent_status,
        created_at
    FROM documents
    WHERE is_active_version = true
    ORDER BY created_at DESC
    LIMIT 20
""")

display(recent_docs)


Unnamed: 0,doc_id,external_id,source_type,title,text_preview,content_timestamp,thread_id,status,intent_status,created_at
0,23d4ac65-0771-4904-859c-5adf6a3c18a1,B7D501E3-BD6B-4DCD-9B6C-E7ED1DAD5D00,macos_reminders,Change my contacts,Change my contacts,2025-11-08 16:47:05+00:00,,indexed,pending,2025-11-10 17:35:54.425456+00:00
1,306e1a32-c6ed-4831-92af-0e80920ff197,E3C316BF-FD5C-4C94-833F-135074237636,macos_reminders,Frequently Asked Questions | Cape Cod Central ...,Frequently Asked Questions | Cape Cod Central ...,2025-10-30 00:05:22+00:00,,indexed,pending,2025-11-10 17:35:54.410341+00:00
2,5091db88-4b21-48fe-900b-33897a7cf4b1,AB385A7B-2494-412C-81E6-BA6D48B5B531,macos_reminders,Plant Dahlia's,Plant Dahlia's,2025-10-18 18:36:54+00:00,,indexed,pending,2025-11-10 17:35:54.402642+00:00
3,fe84e39a-52b9-4c86-bff5-56a1b64ef669,16FB75EF-3801-4EE8-AB2C-463799A08D2F,macos_reminders,Wish Heather a happy anniversary,Wish Heather a happy anniversary,2025-04-04 11:59:41+00:00,,indexed,pending,2025-11-10 17:35:54.393705+00:00
4,08cf9b74-d5ad-4e13-ba85-37385deaa0b5,5258A9B1-9C18-4345-B724-C30B693615F1,macos_reminders,Offer Jenna food,Offer Jenna food,2025-11-10 14:42:09+00:00,,indexed,pending,2025-11-10 17:35:54.386322+00:00
5,93631d7a-9531-4da1-8b67-cebf255fec95,8BC29936-F285-452F-9418-DEF45DF4BD7E,macos_reminders,Get a Reesie's heart,Get a Reesie's heart,2025-03-10 01:54:06+00:00,,indexed,pending,2025-11-10 17:35:54.377827+00:00
6,4a957da7-1a56-4373-bd74-fddf93f3a712,D88D0767-34A5-47FB-AFA7-07CE5B9B13C1,macos_reminders,Give Johnny his medicine,Give Johnny his medicine,2025-11-10 14:42:08+00:00,,indexed,pending,2025-11-10 17:35:54.369291+00:00
7,22924071-5936-4914-9255-1b414b9a932c,03A229AE-A6FB-4A0B-B3D1-C9513910F0DF,macos_reminders,Flowers,Flowers,2023-05-05 00:14:22+00:00,,indexed,pending,2025-11-10 17:35:54.349001+00:00
8,77bb2798-ac42-4a06-8938-efac43807975,2F5CED66-2469-4A5C-84BA-E997D477F5E0,macos_reminders,Homemade cards,Homemade cards,2023-05-05 00:14:22+00:00,,indexed,pending,2025-11-10 17:35:54.328814+00:00
9,a24693dc-b70d-47a7-a37b-d21eeac0547e,ED4E420F-DF11-4BD5-8526-92E7C78B6D6C,macos_reminders,Birthday song,Birthday song,2023-05-05 00:14:22+00:00,,indexed,pending,2025-11-10 17:35:54.311903+00:00


In [15]:
# Documents by source type: for each source_type, the number of active-version rows and
# the number of distinct thread_id values among them, largest group first.
by_source = query_df("""
    SELECT 
        source_type,
        COUNT(*) as count,
        COUNT(DISTINCT thread_id) as thread_count
    FROM documents
    WHERE is_active_version = true
    GROUP BY source_type
    ORDER BY count DESC
""")

display(by_source)


Unnamed: 0,source_type,count,thread_count
0,imessage,50913,324
1,contact,498,0
2,macos_reminders,16,0


In [16]:
# Documents with intents: up to 20 active-version rows whose intent column is not NULL,
# ordered by content_timestamp descending, with a 150-character text preview.
docs_with_intents = query_df("""
    SELECT 
        doc_id,
        external_id,
        source_type,
        LEFT(text, 150) as text_preview,
        intent,
        intent_status,
        content_timestamp
    FROM documents
    WHERE is_active_version = true
      AND intent IS NOT NULL
    ORDER BY content_timestamp DESC
    LIMIT 20
""")

display(docs_with_intents)


In [17]:
# Find a specific document by external_id: fetch the active version with the highest
# version_number, then print a short summary followed by the full record as JSON.
external_id = "325FC66C-6B6E-4447-9979-A6CFB21E30C5"  # Change this

doc = query_one("""
    SELECT *
    FROM documents
    WHERE external_id = %s
      AND is_active_version = true
    ORDER BY version_number DESC
    LIMIT 1
""", (external_id,))

if doc:
    # Guard against a NULL text column (the thread-context cell applies the same guard):
    # slicing None would raise TypeError before anything useful is printed.
    doc_text = doc['text'] or ''
    print(f"Found document: {doc['doc_id']}")
    print(f"Text: {doc_text[:200]}...")
    print(f"Intent: {json.dumps(doc.get('intent'), indent=2) if doc.get('intent') else None}")
    # json_dumps_record copes with the UUID / timestamp values a raw row contains
    print(json_dumps_record(doc, indent=2))
else:
    print(f"No document found with external_id: {external_id}")


Found document: 3b747180-0dc7-4704-83a2-4d463d05e1d0
Text: Getting naked...
Intent: None
{
  "doc_id": "3b747180-0dc7-4704-83a2-4d463d05e1d0",
  "external_id": "325FC66C-6B6E-4447-9979-A6CFB21E30C5",
  "source_type": "macos_reminders",
  "source_provider": null,
  "version_number": 1,
  "previous_version_id": null,
  "is_active_version": true,
  "superseded_at": null,
  "superseded_by_id": null,
  "title": "Getting naked",
  "text": "Getting naked",
  "text_sha256": "857620cd2ed468f2c22432ef45f355e1bc1eba532d0d72b5810b767e4765b313",
  "mime_type": "text/plain",
  "canonical_uri": "reminder://325FC66C-6B6E-4447-9979-A6CFB21E30C5",
  "content_timestamp": "2019-11-03T23:23:44+00:00",
  "content_timestamp_type": "modified",
  "content_created_at": null,
  "content_modified_at": null,
  "people": [],
  "thread_id": null,
  "parent_doc_id": null,
  "source_doc_ids": [],
  "related_doc_ids": [],
  "has_attachments": false,
  "attachment_count": 0,
  "has_location": false,
  "has_due_date": fa

## Threads


In [None]:
# Recent threads: 20 threads ordered by last_message_at (newest first, NULLs last).
# doc_count comes from a correlated subquery counting the active-version documents
# that reference each thread.
recent_threads = query_df("""
    SELECT 
        thread_id,
        external_id,
        source_type,
        title,
        participant_count,
        first_message_at,
        last_message_at,
        (SELECT COUNT(*) FROM documents WHERE thread_id = t.thread_id AND is_active_version = true) as doc_count
    FROM threads t
    ORDER BY last_message_at DESC NULLS LAST
    LIMIT 20
""")

display(recent_threads)


In [None]:
# Get thread context for a document (useful for testing intent classification):
# the first 10 active-version messages, oldest first, of the thread that doc_id belongs to.
doc_id = "your-doc-id-here"  # Change this to a UUID

# Validate before querying: the SQL casts the parameter with ::uuid, so a placeholder or
# mistyped value would otherwise fail inside the database and abort a Run All.
try:
    doc_uuid = UUID(str(doc_id))
except ValueError:
    doc_uuid = None

if doc_uuid is None:
    thread_context = []
    print(f"doc_id {doc_id!r} is not a valid UUID; set doc_id above and re-run this cell.")
else:
    thread_context = query("""
    SELECT 
        doc_id,
        text,
        content_timestamp,
        metadata->>'sender' as sender,
        metadata->>'from' as from_field,
        people
    FROM documents
    WHERE thread_id = (
        SELECT thread_id FROM documents WHERE doc_id = %s::uuid
    )
      AND is_active_version = true
    ORDER BY content_timestamp ASC
    LIMIT 10
""", (str(doc_uuid),))

    print(f"Thread context ({len(thread_context)} messages):")
    for i, msg in enumerate(thread_context, 1):
        # Prefer metadata.sender, then metadata.from, else a fixed label
        sender = msg.get('sender') or msg.get('from_field') or 'unknown'
        text_preview = msg['text'][:100] if msg['text'] else ''
        print(f"\n{i}. [{sender}] {text_preview}...")


## Intent Signals


In [None]:
# Recent intent signals: the 20 newest rows by created_at. intent_name is pulled out of
# signal_data as text (->>), while slots is kept as a JSON value (->).
recent_signals = query_df("""
    SELECT 
        signal_id,
        artifact_id,
        taxonomy_version,
        signal_data->>'intent_name' as intent_name,
        signal_data->'slots' as slots,
        status,
        conflict,
        created_at
    FROM intent_signals
    ORDER BY created_at DESC
    LIMIT 20
""")

display(recent_signals)


In [None]:
# Intent signals by intent type: per intent_name, the total number of signals and how many
# of them have conflict = true. Rows whose signal_data has no intent_name are excluded.
signals_by_intent = query_df("""
    SELECT 
        signal_data->>'intent_name' as intent_name,
        COUNT(*) as count,
        COUNT(*) FILTER (WHERE conflict = true) as conflict_count
    FROM intent_signals
    WHERE signal_data->>'intent_name' IS NOT NULL
    GROUP BY signal_data->>'intent_name'
    ORDER BY count DESC
""")

display(signals_by_intent)


## Intent Processing Status


In [None]:
# Documents pending intent processing: up to 20 active-version rows with
# intent_status = 'pending', ordered by content_timestamp descending.
pending_intents = query_df("""
    SELECT 
        doc_id,
        external_id,
        source_type,
        LEFT(text, 100) as text_preview,
        intent_status,
        content_timestamp
    FROM documents
    WHERE is_active_version = true
      AND intent_status = 'pending'
    ORDER BY content_timestamp DESC
    LIMIT 20
""")

display(pending_intents)


In [None]:
# Intent processing status summary: per intent_status, the number of active-version
# documents and how many of those have a non-NULL intent, largest group first.
intent_status_summary = query_df("""
    SELECT 
        intent_status,
        COUNT(*) as count,
        COUNT(*) FILTER (WHERE intent IS NOT NULL) as has_intent_count
    FROM documents
    WHERE is_active_version = true
    GROUP BY intent_status
    ORDER BY count DESC
""")

display(intent_status_summary)


In [None]:
# Documents with intent processing errors: up to 20 active-version rows with
# intent_status = 'failed', including the stored intent_processing_error,
# ordered by content_timestamp descending.
failed_intents = query_df("""
    SELECT 
        doc_id,
        external_id,
        source_type,
        LEFT(text, 150) as text_preview,
        intent_status,
        intent_processing_error,
        content_timestamp
    FROM documents
    WHERE is_active_version = true
      AND intent_status = 'failed'
    ORDER BY content_timestamp DESC
    LIMIT 20
""")

display(failed_intents)


## Custom Queries


In [14]:
# Write your own SQL query here. The sample selects 10 active-version documents whose
# source_type is 'macos_reminders'. It has no ORDER BY, so which 10 rows come back is
# not specified and may differ between runs.
custom_sql = """
    SELECT 
        doc_id,
        external_id,
        source_type,
        LEFT(text, 200) as text_preview
    FROM documents
    WHERE is_active_version = true
        AND source_type = 'macos_reminders'
    LIMIT 10
"""

results = query_df(custom_sql)
display(results)


Unnamed: 0,doc_id,external_id,source_type,text_preview
0,8bd92469-1b9b-4e07-9ab0-2aaf0de9c700,70E46C66-83A6-48F5-BBC9-4CEA9726760E,macos_reminders,Robot maintenance
1,27dd79e8-6c12-4e6d-a827-cb2677faebd5,8FEC24A7-145D-47C0-8F0E-2D54CDBFCBD5,macos_reminders,Change my contacts
2,3b747180-0dc7-4704-83a2-4d463d05e1d0,325FC66C-6B6E-4447-9979-A6CFB21E30C5,macos_reminders,Getting naked
3,133d36a2-9886-4656-abe0-bf966459ea54,2F1784D7-17FC-4738-BC14-C60D7CB4479D,macos_reminders,Pay the electric bill
4,5802f2ac-560c-44dd-83a7-467e08d195da,A2C05617-65D6-409E-8E31-D06FA7FF3F46,macos_reminders,K birthday
5,c05a5c7c-8fb9-4631-9d03-8f15d1189ab8,88E32D76-D983-4131-A510-3E35FB74D5B4,macos_reminders,Cake
6,a24693dc-b70d-47a7-a37b-d21eeac0547e,ED4E420F-DF11-4BD5-8526-92E7C78B6D6C,macos_reminders,Birthday song
7,77bb2798-ac42-4a06-8938-efac43807975,2F5CED66-2469-4A5C-84BA-E997D477F5E0,macos_reminders,Homemade cards
8,22924071-5936-4914-9255-1b414b9a932c,03A229AE-A6FB-4A0B-B3D1-C9513910F0DF,macos_reminders,Flowers
9,4a957da7-1a56-4373-bd74-fddf93f3a712,D88D0767-34A5-47FB-AFA7-07CE5B9B13C1,macos_reminders,Give Johnny his medicine


In [None]:
# Example: Find documents with specific text pattern.
# The pattern is sent as a bound parameter to ILIKE (case-insensitive LIKE), so the
# % wildcards belong in the Python value below rather than in the SQL text.
search_text = "%eggs%"  # SQL LIKE pattern

matching_docs = query_df("""
    SELECT 
        doc_id,
        external_id,
        source_type,
        text,
        intent,
        content_timestamp
    FROM documents
    WHERE is_active_version = true
      AND text ILIKE %s
    ORDER BY content_timestamp DESC
    LIMIT 10
""", (search_text,))

display(matching_docs)


## People & Relationships


In [None]:
# People in the database: the 20 most recently created rows of the people table.
# NOTE(review): display_name / normalized_identifiers presumably hold personal data;
# clear this cell's output before sharing the notebook.
people_list = query_df("""
    SELECT 
        person_id,
        display_name,
        normalized_identifiers,
        created_at
    FROM people
    ORDER BY created_at DESC
    LIMIT 20
""")

display(people_list)


In [None]:
# Top relationships: the 20 crm_relationships rows with the highest score. The people
# table is joined twice to resolve display names: p1 for self_person_id, p2 for person_id.
# Both are inner joins, so a relationship missing either person row is dropped.
top_relationships = query_df("""
    SELECT 
        r.relationship_id,
        p1.display_name as self_name,
        p2.display_name as contact_name,
        r.score,
        r.last_contact_at,
        r.decay_bucket
    FROM crm_relationships r
    JOIN people p1 ON r.self_person_id = p1.person_id
    JOIN people p2 ON r.person_id = p2.person_id
    ORDER BY r.score DESC
    LIMIT 20
""")

display(top_relationships)


## Chunks & Embeddings


In [None]:
# Chunk embedding status: per embedding_status, the number of chunks and how many of
# them have a non-NULL embedding_vector, largest group first.
chunk_status = query_df("""
    SELECT 
        embedding_status,
        COUNT(*) as count,
        COUNT(*) FILTER (WHERE embedding_vector IS NOT NULL) as has_vector_count
    FROM chunks
    GROUP BY embedding_status
    ORDER BY count DESC
""")

display(chunk_status)


In [None]:
# Documents with chunks: the 20 active-version documents linked to the most chunks
# through chunk_documents. The HAVING clause removes documents with no linked chunk,
# so the LEFT JOIN ends up returning only documents that have at least one.
docs_with_chunks = query_df("""
    SELECT 
        d.doc_id,
        d.external_id,
        d.source_type,
        COUNT(cd.chunk_id) as chunk_count
    FROM documents d
    LEFT JOIN chunk_documents cd ON d.doc_id = cd.doc_id
    WHERE d.is_active_version = true
    GROUP BY d.doc_id, d.external_id, d.source_type
    HAVING COUNT(cd.chunk_id) > 0
    ORDER BY chunk_count DESC
    LIMIT 20
""")

display(docs_with_chunks)
