In [1]:
# Cell 1 - Imports and Setup
from pathlib import Path
import os
from dotenv import load_dotenv
from txtai.embeddings import Embeddings
import logging
import json

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
load_dotenv()

True

# Embeddings Usage Examples

This notebook demonstrates all standard search and management operations with txtai embeddings:
1. Document Operations (add, delete, update)
2. Search Types (semantic, SQL, hybrid)
3. Query Patterns
4. Result Processing

## Cells 2-3: Configuration
The previous notebooks (01_Embeddings_Overview, 02_Content_Storage, 03_Cloud_Storage) explain the configuration options for the embeddings.

In [2]:
# Cell 2 - Standard Configuration
config = {
    "path": "sentence-transformers/nli-mpnet-base-v2",
    "content": True,
    "backend": "faiss",
    "hybrid": True,
    "scoring": {
        "method": "bm25",
        "terms": True,
        "normalize": True
    },
    "normalize": True
}

logger.info("Initializing embeddings...")
embeddings = Embeddings(config)

2024-11-17 15:07:51,283 - INFO - Initializing embeddings...


In [3]:
# Cell 3 - Test Documents
test_docs = [
    {
        "id": "doc1",
        "text": "Machine learning models require significant computational resources",
        "metadata": {
            "category": "tech",
            "tags": ["ML", "computing"],
            "priority": 1
        }
    },
    {
        "id": "doc2",
        "text": "Cloud computing enables scalable infrastructure solutions",
        "metadata": {
            "category": "tech",
            "tags": ["cloud", "infrastructure"],
            "priority": 2
        }
    },
    {
        "id": "doc3",
        "text": "Natural language processing transforms text into structured data",
        "metadata": {
            "category": "tech",
            "tags": ["NLP", "data"],
            "priority": 1
        }
    }
]

# Index documents
logger.info("Indexing test documents...")
embeddings.index([(doc["id"], doc["text"], json.dumps(doc["metadata"])) for doc in test_docs])

2024-11-17 15:07:52,035 - INFO - Indexing test documents...



# Cells 4-6: Standard Search Operations

The embeddings index combines semantic search with SQL to provide a flexible search interface.

We can search using semantic similarity, SQL, or a hybrid of the two.


## Cell 4 - SQL Search

SQL search is like a SQL database.

We are simply filtering the results based on the SQL query.


In [4]:
# Cell 4 - SQL Search
def test_sql_search():
    logger.info("\nTesting SQL Search Operations:")

    # Basic SQL filter
    logger.info("\n1. Basic SQL Filter:")
    basic_query = """
        select text, score
        from txtai
        where metadata like '%technical%'
    """
    sql_results = embeddings.search(
        basic_query,
        limit=1
    )

    # Process results safely
    if sql_results:
        logger.info(f"SQL-filtered Result: {sql_results[0]['text']}")
        logger.info(f"Score: {sql_results[0]['score']}")
    else:
        logger.info("No results found")

    # Complex SQL filter
    logger.info("\n2. Complex SQL Filter:")
    complex_query = """
        select text, score
        from txtai
        where metadata like '%tech%'
        and score > 0.5
        order by score desc
    """
    complex_results = embeddings.search(
        complex_query,
        limit=2
    )

    # Process complex results
    for idx, result in enumerate(complex_results, 1):
        logger.info(f"\nResult {idx}:")
        logger.info(f"Text: {result['text']}")
        logger.info(f"Score: {result['score']}")

test_sql_search()

2024-11-17 15:07:52,205 - INFO - 
Testing SQL Search Operations:
2024-11-17 15:07:52,206 - INFO - 
1. Basic SQL Filter:
2024-11-17 15:07:52,206 - INFO - No results found
2024-11-17 15:07:52,207 - INFO - 
2. Complex SQL Filter:


## Cell 5 - Basic Semantic Search

The basic semantic search option is like RAG without the LLM.

We are just measuring the similarity between the query and the indexed documents, and returning the most similar documents.

In [5]:
# Cell 5 - Basic Search Operations
def test_semantic_search():
    logger.info("\nTesting different search operations:")

    ## Basic Semantic Search
    logger.info("\nBasic Semantic Search:")
    semantic_results = embeddings.search("computational resources for AI", 1)
    logger.info(f"Semantic Search Result: {semantic_results[0]['text']}")
    logger.info(f"Score: {semantic_results[0]['score']}")

test_semantic_search()

2024-11-17 15:07:52,221 - INFO - 
Testing different search operations:
2024-11-17 15:07:52,222 - INFO - 
Basic Semantic Search:
2024-11-17 15:07:52,250 - INFO - Semantic Search Result: Machine learning models require significant computational resources
2024-11-17 15:07:52,250 - INFO - Score: 0.6655701879559184


## Cell 6 - Hybrid Search

Hybrid search combines semantic search with SQL.

It uses the benefits of lakehouse architecture to combine the speed of semantic search with the flexibility of SQL.

Basically, the lakehouse architecture (txtai embeddings) has a single centralized method of managing the embeddings.

This means we can start the query by narrowing the search space using SQL, and then use semantic search to find the most similar documents, all in one query operation.


In [6]:
# Cell 6 - Basic Search Operations
def test_hybrid_search():
    logger.info("\nTesting different search operations:")
    # 2. Hybrid Search (using config with hybrid=True)
    logger.info("\n2. Hybrid Search:")
    hybrid_results = embeddings.search(
        "machine learning infrastructure",
        1
    )
    logger.info(f"Hybrid Search Result: {hybrid_results[0]['text']}")
    logger.info(f"Score: {hybrid_results[0]['score']}")

test_hybrid_search()

2024-11-17 15:07:52,255 - INFO - 
Testing different search operations:
2024-11-17 15:07:52,255 - INFO - 
2. Hybrid Search:
2024-11-17 15:07:52,269 - INFO - Hybrid Search Result: Machine learning models require significant computational resources
2024-11-17 15:07:52,271 - INFO - Score: 0.6688058558998728


# Cells 7-9: Document Management Operations

The document management operations are used to add, update, and delete documents from the index.

In [7]:
# Cell 7 - Add Document Operations
def test_add_document():
    logger.info("\nTesting Add Document Operations:")

    new_doc = {
        "id": "doc4",
        "text": "Data visualization helps understand complex patterns",
        "metadata": {
            "category": "tech",
            "tags": ["visualization", "data"],
            "priority": 2
        }
    }

    # Single document addition
    embeddings.index([(new_doc["id"], new_doc["text"], json.dumps(new_doc["metadata"]))])

    # Verify addition
    results = embeddings.search("visualization patterns", 1)
    logger.info(f"New document search result: {results[0]['text']}")
    logger.info(f"Score: {results[0]['score']}")

test_add_document()

2024-11-17 15:07:52,276 - INFO - 
Testing Add Document Operations:
2024-11-17 15:07:52,297 - INFO - New document search result: Data visualization helps understand complex patterns
2024-11-17 15:07:52,297 - INFO - Score: 0.6688700570231481


# Cell 8 - Document Update Operations

Now we'll test updating an existing document in the index. This includes:
1. Updating the document text
2. Preserving the metadata
3. Verifying the update through search

In [8]:
# Cell 8 - Update Document Operations
def test_update_document():
    logger.info("\nTesting Update Document Operations:")

    # Initial search to show current document
    initial_results = embeddings.search("visualization", 1)
    logger.info(f"Before update - Text: {initial_results[0]['text']}")
    logger.info(f"Before update - Score: {initial_results[0]['score']}")

    # Update by re-indexing the document
    doc_id = "doc4"
    updated_text = "Advanced data visualization reveals hidden patterns"
    updated_metadata = {
        "category": "tech",
        "tags": ["visualization", "data", "advanced"],
        "priority": 1
    }

    # Re-index to update
    embeddings.index([(doc_id, updated_text, json.dumps(updated_metadata))])

    # Verify update
    results = embeddings.search("advanced visualization", 1)
    logger.info(f"After update - Text: {results[0]['text']}")
    logger.info(f"After update - Score: {results[0]['score']}")

test_update_document()

2024-11-17 15:07:52,304 - INFO - 
Testing Update Document Operations:
2024-11-17 15:07:52,318 - INFO - Before update - Text: Data visualization helps understand complex patterns
2024-11-17 15:07:52,321 - INFO - Before update - Score: 0.5180126488468576
2024-11-17 15:07:52,343 - INFO - After update - Text: Advanced data visualization reveals hidden patterns
2024-11-17 15:07:52,343 - INFO - After update - Score: 0.6751291705733342


# Document Delete Operations

Finally, we'll test document deletion:
1. Delete a specific document
2. Verify the deletion through search
3. Confirm the document is no longer retrievable

In [9]:
# Cell 9 - Delete Document Operations
def test_delete_document():
    logger.info("\nTesting Delete Document Operations:")

    # Initial search to confirm document exists
    initial_results = embeddings.search("visualization", 1)
    logger.info(f"Before deletion - Text: {initial_results[0]['text']}")
    logger.info(f"Before deletion - Score: {initial_results[0]['score']}")

    # Delete document
    doc_id = "doc4"
    embeddings.delete([doc_id])
    logger.info(f"Deleted document with ID: {doc_id}")

    # Verify deletion by searching for the same content
    post_results = embeddings.search("visualization", 1)
    if post_results and post_results[0]['score'] < initial_results[0]['score']:
        logger.info("Document successfully deleted")
        logger.info(f"New top result - Text: {post_results[0]['text']}")
        logger.info(f"New top result - Score: {post_results[0]['score']}")
    else:
        logger.info("No matching results found after deletion")

test_delete_document()

2024-11-17 15:07:52,351 - INFO - 
Testing Delete Document Operations:
2024-11-17 15:07:52,370 - INFO - Before deletion - Text: Advanced data visualization reveals hidden patterns
2024-11-17 15:07:52,370 - INFO - Before deletion - Score: 0.47253335120891815
2024-11-17 15:07:52,371 - INFO - Deleted document with ID: doc4
2024-11-17 15:07:52,390 - INFO - No matching results found after deletion
