In [None]:
from __future__ import annotations

import os
from pathlib import Path

from symai import Symbol
from symai.backend.engines.index.engine_qdrant import QdrantIndexEngine
from symai.interfaces import Interface

In [None]:
# Config
#
# Machine-specific settings can be overridden through environment variables so
# the notebook runs unchanged on other machines. When a variable is unset, the
# original hard-coded value is used.
#   SYMAI_DEMO_PDF_DIR - directory that contains the demo PDFs
#   QDRANT_URL         - base URL of the Qdrant server
#   QDRANT_API_KEY     - API key; unset or empty means "no key" (None)
PDF_DIR = Path(
    os.environ.get(
        "SYMAI_DEMO_PDF_DIR",
        "/Users/ryang/Work/ExtensityAI/agentic-flows/testfiles",
    )
)
PDF_PATHS = [
    PDF_DIR / "symbolicai.pdf",
    PDF_DIR / "radt.pdf",
    PDF_DIR / "llms_can_selfimprove.pdf",
]

COLLECTION_NAME = "demo_collection"
# NOTE(review): presumably has to equal the length of the embedding vectors
# passed to `engine.search` later on - confirm against the embedding model.
VECTOR_SIZE = 1536

QDRANT_URL = os.environ.get("QDRANT_URL", "http://localhost:6333")
# `or None` maps an empty string to None, matching the original `api_key=None`.
QDRANT_API_KEY = os.environ.get("QDRANT_API_KEY") or None

engine = QdrantIndexEngine(
    url=QDRANT_URL,
    api_key=QDRANT_API_KEY,
    index_name=COLLECTION_NAME,
    index_dims=VECTOR_SIZE,
    index_top_k=5,
    index_metric="Cosine",
)

In [None]:
# Create Collection
await engine.create_collection(
    collection_name=COLLECTION_NAME,
    vector_size=VECTOR_SIZE,
    distance="Cosine",
)

In [None]:
# Chunk and Upsert PDF Files
# Note: chunk provenance (start/end line) is stored by default.
num_chunks_0 = await engine.chunk_and_upsert(
    collection_name=COLLECTION_NAME,
    document_path=str(PDF_PATHS[0]),
    include_line_numbers=True,
    metadata={
        "source": str(PDF_PATHS[0].resolve()),
        "filename": PDF_PATHS[0].name,
        "tags": ["symbolicai", "pdf", "code"],
    },
)

num_chunks_1 = await engine.chunk_and_upsert(
    collection_name=COLLECTION_NAME,
    document_path=str(PDF_PATHS[1]),
    include_line_numbers=True,
    metadata={
        "source": str(PDF_PATHS[1].resolve()),
        "filename": PDF_PATHS[1].name,
        "tags": ["radt", "pdf", "llm"],
    },
)

num_chunks_2 = await engine.chunk_and_upsert(
    collection_name=COLLECTION_NAME,
    document_path=str(PDF_PATHS[2]),
    include_line_numbers=True,
    metadata={
        "source": str(PDF_PATHS[2].resolve()),
        "filename": PDF_PATHS[2].name,
        "tags": ["llms_can_selfimprove", "pdf", "llm"],
    },
)

In [None]:
    # Check if collection exists
    exists = await engine.collection_exists("demo_collection")
    print(f"Collection exists: {exists}")

    # List all collections
    collections = await engine.list_collections()
    print(f"Collections: {collections}")

    # Get collection info
    info = await engine.get_collection_info("demo_collection")
    print(f"Points: {info['points_count']}")

    # Delete collection
    # await engine.delete_collection("demo_collection")
    # exists = await engine.collection_exists("demo_collection")
    # print(f"Collection exists: {exists}")

In [None]:
# Deletion examples - intentionally disabled.
#
# Everything in this cell is commented out because `engine.delete_documents`
# removes chunks from the collection, and the cells further down still filter on
# the tags and sources used here (presumably they would return fewer or no
# results once these deletions have run - verify before enabling).
#
# Lines with a single leading "# " become code (or a plain comment) when the
# cell is uncommented once. The block prefixed with "# # " stays commented out
# even then and has to be uncommented a second time.

# # Delete all chunks of a specific document by its absolute path
# await engine.delete_documents(
#     collection_name=COLLECTION_NAME,
#     documents=str(PDF_PATHS[0].resolve()),
# )
# print(f"Deleted all chunks belonging to document: {PDF_PATHS[0].name}")

# # Delete chunks for multiple documents
# # await engine.delete_documents(
# #     collection_name=COLLECTION_NAME,
# #     documents=[str(p.resolve()) for p in PDF_PATHS[1:]],
# # )
# # print("Deleted all chunks belonging to remaining documents.")

# # Delete by tags (e.g., remove all chunks tagged 'llm')
# await engine.delete_documents(
#     collection_name=COLLECTION_NAME,
#     tags=["llm"],
# )
# print("Deleted chunks with tag='llm'.")




In [None]:

# Existence checks

# Check if a document exists (any chunk with payload["source"] == document path)
doc_path = str(PDF_PATHS[0].resolve())
document_exists = await engine.document_exists(COLLECTION_NAME, doc_path)
print(f"Document exists ({doc_path}): {document_exists}")

# Check if a tag exists (any chunk whose payload['tags'] matches the tag)
tag = "llm"
tag_exists = await engine.tag_exists(COLLECTION_NAME, tag)
print(f"Tag exists ('{tag}'): {tag_exists}")

# Counting

# Count chunks/points matching filter (counts chunks, not unique docs)
tag_filter = {"tags": [tag]}
chunk_count = await engine.count(COLLECTION_NAME, query_filter=tag_filter)
print(f"Chunks with tag '{tag}': {chunk_count}")

# Count documents for a tag (counts unique payload['source'] values)
doc_count_for_tag = await engine.count_documents_for_tag(COLLECTION_NAME, tags=tag)
print(f"Unique documents with tag '{tag}': {doc_count_for_tag}")

# List unique documents (payload['source']) for tags (can pass one or several tags)
docs_with_tags = await engine.documents_for_tag(COLLECTION_NAME, tags=["pdf", "llm"])
print(f"Unique document sources for tags: {docs_with_tags}")




In [None]:
# RAG Search
query = Symbol("How can RAG and Symbolicai be combined with a persistent LLM memory system?")

results = await engine.search(
    collection_name=COLLECTION_NAME,
    query_vector=query.embedding,
    limit=5,
    query_filter={"tags": ["symbolicai"]},
    with_payload=True,
)

print(f"Got {len(results)} results (tag='symbolicai', dict filter)")
for i, r in enumerate(results, start=1):
    payload = r.payload or {}
    excerpt = (payload.get("text") or payload.get("content") or "")[:200].replace("\n", " ")
    src = payload.get("source") or payload.get("url") or payload.get("file_path") or payload.get("path")
    url = engine._resolve_payload_url(payload, COLLECTION_NAME, getattr(r, "id", ""))

    sl = payload.get("chunk_start_line")
    el = payload.get("chunk_end_line")
    sp = payload.get("chunk_start_page")
    ep = payload.get("chunk_end_page")

    print(f"\n[{i}] score={getattr(r, 'score', None)}")
    print(f"source={src}")
    print(f"resolved_url={url}")
    print(f"chunk_pages={sp}-{ep}")
    print(f"chunk_lines={sl}-{el}")
    print(f"tags={payload.get('tags')}")
    print(f"text={excerpt}...")

In [None]:
# Local Search Interface
#
# Same question as the vector search, but issued as plain text through the
# "local_search" interface bound to the demo collection.
search = Interface("local_search", index_name=COLLECTION_NAME)

result = search.search(
    "How can RAG and Symbolicai be combined with a persistent LLM memory system?",
    collection_name=COLLECTION_NAME,
    limit=5,
    score_threshold=0.2,
    with_payload=True,
    with_vectors=False,
)

print(result.value)

# Prefer the get_citations() accessor; otherwise fall back to a plain
# `citations` attribute (None when neither is present).
if hasattr(result, "get_citations"):
    citations = result.get_citations()
else:
    citations = getattr(result, "citations", None)

print("\nCitations (PDFs may include #page=N):")
for idx, citation in enumerate(citations or [], start=1):
    # Missing attributes are printed as None.
    cite_url = getattr(citation, "url", None)
    cite_title = getattr(citation, "title", None)
    print(f"[{idx}] title={cite_title}")
    print(f"    url={cite_url}")


In [None]:
# Chunk provenance demo (PDFs / existing collection)
#
# The chunker stores:
# - `payload["chunk_start_line"]` / `payload["chunk_end_line"]`
# - (when page breaks are available in extracted text) `payload["chunk_start_page"]` / `payload["chunk_end_page"]`
#
# For PDFs, `resolved_url` will include a fragment like `#page=N` when available.

query = Symbol("What is the main idea of the document?")
results = await engine.search(
    collection_name=COLLECTION_NAME,
    query_vector=query.embedding,
    limit=5,
    query_filter={"tags": ["pdf"]},
    with_payload=True,
)

print(f"Got {len(results)} results (tag='pdf')")
for i, r in enumerate(results, start=1):
    payload = r.payload or {}
    filename = payload.get("filename")
    src = payload.get("source")
    url = engine._resolve_payload_url(payload, COLLECTION_NAME, getattr(r, "id", ""))

    sl = payload.get("chunk_start_line")
    el = payload.get("chunk_end_line")
    sp = payload.get("chunk_start_page")
    ep = payload.get("chunk_end_page")

    excerpt = (payload.get("text") or payload.get("content") or "").replace("\n", " ")[:160]

    print(f"\n[{i}] score={getattr(r, 'score', None)}")
    print(f"filename={filename}")
    print(f"source={src}")
    print(f"resolved_url={url}")
    print(f"chunk_pages={sp}-{ep}")
    print(f"chunk_lines={sl}-{el}")
    print(f"text={excerpt}...")

# Same via local_search interface (citations should include #page=N for PDFs when available)
search = Interface("local_search", index_name=COLLECTION_NAME)
result = search.search(
    "main idea",
    collection_name=COLLECTION_NAME,
    limit=5,
    score_threshold=0.2,
    with_payload=True,
    with_vectors=False,
    metadata={"tags": ["pdf"]},
)

# Look citations up defensively: use get_citations() when the result offers it,
# otherwise fall back to a `citations` attribute. A result exposing neither
# yields an empty listing instead of raising AttributeError.
citations = result.get_citations() if hasattr(result, "get_citations") else getattr(result, "citations", None)

print("\nCitations (look for file://...#page=N fragments):")
for i, c in enumerate(citations or [], start=1):
    # Missing attributes are printed as None.
    url = getattr(c, "url", None)
    title = getattr(c, "title", None)
    print(f"[{i}] title={title}")
    print(f"    url={url}")
